'''
This script performs the basic process for applying a machine learning
algorithm to a dataset using Python libraries.

The four steps are:
    1. Download a dataset (using pandas)
    2. Process the numeric data (using numpy)
    3. Train and evaluate learners (using scikit-learn)
    4. Plot and compare results (using matplotlib)

The data is downloaded from URL, which is defined below. As is normal
for machine learning problems, the nature of the source data affects
the entire solution. When you change URL to refer to your own data, you
will need to review the data processing steps to ensure they remain
correct.

============
Example Data
============
The example is from https://archive-beta.ics.uci.edu/ml/datasets/spambase
It contains pre-processed metrics, such as the frequency of certain
words and letters, from a collection of emails. A classification for
each one indicating 'spam' or 'not spam' is in the final column.
See the linked page for full details of the data set.

This script uses three classifiers to predict the class of an email
based on the metrics. These are not representative of modern spam
detection systems.
'''

# Remember to update the script for the new data when you change this URL
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
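# Each row of this file holds 57 comma-separated numeric attributes
# followed by the class label in the final column (1 = spam, 0 = not spam)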

# Uncomment this call when using matplotlib to generate images
# rather than displaying interactive UI.
#import matplotlib
#matplotlib.use('Agg')

from pandas import read_table
import numpy as np
import matplotlib.pyplot as plt

try:
    # [OPTIONAL] Seaborn makes plots nicer
    import seaborn
except ImportError:
    pass


# =====================================================================

def download_data():
    '''
    Downloads the data for this script into a pandas DataFrame.
    '''

    # If your data is in an Excel file, install 'xlrd' (or 'openpyxl'
    # for .xlsx files) and use pandas.read_excel instead of read_table
    #from pandas import read_excel
    #frame = read_excel(URL)

    # If your data is in a private Azure blob, install 'azure-storage' and use
    # BlockBlobService.get_blob_to_path() with read_table() or read_excel()
    # (BlockBlobService is from the legacy azure-storage SDK; newer code
    # uses azure.storage.blob.BlobServiceClient instead)
    #from azure.storage.blob import BlockBlobService
    #service = BlockBlobService(ACCOUNT_NAME, ACCOUNT_KEY)
    #service.get_blob_to_path(container_name, blob_name, 'my_data.csv')
    #frame = read_table('my_data.csv', ...

    frame = read_table(
        URL,

        # Uncomment if the file needs to be decompressed
        #compression='gzip',
        #compression='bz2',

        # Specify the file encoding
        # Latin-1 is common for data from US sources
        encoding='latin-1',
        #encoding='utf-8',  # UTF-8 is also common

        # Specify the separator in the data
        sep=',',            # comma separated values
        #sep='\t',          # tab separated values
        #sep=' ',           # space separated values

        # Ignore spaces after the separator
        skipinitialspace=True,

        # Generate row labels from each row number
        index_col=None,
        #index_col=0,       # use the first column as row labels
        #index_col=-1,      # use the last column as row labels

        # Generate column headers from each column number
        header=None,
        #header=0,          # use the first line as headers

        # Use manual headers and skip the first row in the file
        #header=0,
        #names=['col1', 'col2', ...],
    )

    # Return a subset of the columns
    #return frame[['col1', 'col4', ...]]

    # Return the entire frame
    return frame


# =====================================================================


def get_features_and_labels(frame):
    '''
    Transforms and scales the input data and returns numpy arrays for
    training and testing inputs and targets.
    '''

    # Replace missing values with 0.0, or we can use
    # scikit-learn to calculate missing values (below)
    #frame[frame.isnull()] = 0.0

    # Convert values to floats
    # (np.float was removed from numpy; the built-in float is equivalent)
    arr = np.array(frame, dtype=float)

    # Use the last column as the target value
    X, y = arr[:, :-1], arr[:, -1]
    # To use the first column instead, change the index value
    #X, y = arr[:, 1:], arr[:, 0]

    # Use 80% of the data for training; test against the rest
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # sklearn.pipeline.make_pipeline could also be used to chain
    # processing and classification into a black box, but here we do
    # them separately. A sketch of that alternative follows.
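    # A minimal sketch of the pipeline version, assuming scikit-learn's
    # make_pipeline; it mirrors the separate scaling and classification
    # steps performed in this script:
    #from sklearn.pipeline import make_pipeline
    #from sklearn.preprocessing import StandardScaler
    #from sklearn.svm import LinearSVC
    #pipe = make_pipeline(StandardScaler(), LinearSVC(C=1))
    #pipe.fit(X_train, y_train)
    #print(pipe.score(X_test, y_test))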

    # If values are missing we could impute them from the training data
    # (sklearn.preprocessing.Imputer was removed from scikit-learn;
    # use sklearn.impute.SimpleImputer instead)
    #from sklearn.impute import SimpleImputer
    #imputer = SimpleImputer(strategy='mean')
    #imputer.fit(X_train)
    #X_train = imputer.transform(X_train)
    #X_test = imputer.transform(X_test)

    # Normalize the attribute values to mean=0 and variance=1
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
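    # (StandardScaler learns the per-attribute mean and standard deviation
    # when fit, then transforms each value as z = (x - mean) / std)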
    # To scale to a specified range, use MinMaxScaler
    #from sklearn.preprocessing import MinMaxScaler
    #scaler = MinMaxScaler(feature_range=(0, 1))

    # Fit the scaler based on the training data, then apply the same
    # scaling to both training and test sets, so no information from
    # the test set leaks into the preprocessing.
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Return the training and test sets
    return X_train, X_test, y_train, y_test


# =====================================================================


def evaluate_classifier(X_train, X_test, y_train, y_test):
    '''
    Run multiple times with different classifiers to get an idea of the
    relative performance of each configuration.

    Returns a sequence of tuples containing:
        (title, precision, recall)
    for each learner.
    '''

    # Import some classifiers to test
    from sklearn.svm import LinearSVC, NuSVC
    from sklearn.ensemble import AdaBoostClassifier

    # We will calculate the P-R curve for each classifier
    from sklearn.metrics import precision_recall_curve, f1_score
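    # f1_score summarizes a classifier with the harmonic mean of
    # precision and recall: F1 = 2 * P * R / (P + R)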

    # Here we create classifiers with default parameters. These need
    # to be adjusted to obtain optimal performance on your data set.

    # Test the linear support vector classifier
    classifier = LinearSVC(C=1)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve by sweeping a threshold over the
    # continuous scores returned by decision_function
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'Linear SVC (F1 score={:.3f})'.format(score), precision, recall

    # Test the Nu support vector classifier
    classifier = NuSVC(kernel='rbf', nu=0.5, gamma=1e-3)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'NuSVC (F1 score={:.3f})'.format(score), precision, recall

    # Test the Ada boost classifier
    # (the algorithm='SAMME.R' option was removed from newer scikit-learn,
    # so we rely on the default algorithm here)
    classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'Ada Boost (F1 score={:.3f})'.format(score), precision, recall
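    # The three blocks above differ only in the classifier used, so they
    # could also be driven from a list; a minimal sketch (the pairs below
    # just repeat the configurations from this function):
    #for title, clf in [('Linear SVC', LinearSVC(C=1)),
    #                   ('NuSVC', NuSVC(kernel='rbf', nu=0.5, gamma=1e-3)),
    #                   ('Ada Boost', AdaBoostClassifier(n_estimators=50))]:
    #    clf.fit(X_train, y_train)
    #    score = f1_score(y_test, clf.predict(X_test))
    #    precision, recall, _ = precision_recall_curve(
    #        y_test, clf.decision_function(X_test))
    #    yield '{} (F1 score={:.3f})'.format(title, score), precision, recall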


# =====================================================================


def plot(results):
    '''
    Create a plot comparing multiple learners.

    `results` is a list of tuples containing:
        (title, precision, recall)

    All the elements in results will be plotted.
    '''

    # Plot the precision-recall curves

    fig = plt.figure(figsize=(6, 6))
    # (newer matplotlib exposes set_window_title on the canvas manager
    # rather than on the canvas itself)
    fig.canvas.manager.set_window_title('Classifying data from ' + URL)

    for label, precision, recall in results:
        plt.plot(recall, precision, label=label)

    plt.title('Precision-Recall Curves')
    # Recall is plotted on the x-axis and precision on the y-axis
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='lower left')

    # Let matplotlib improve the layout
    plt.tight_layout()

    # ==================================
    # Display the plot in interactive UI
    plt.show()

    # To save the plot to an image file, use savefig()
    #plt.savefig('plot.png')

    # Open the image file with the default image viewer
    # (this shell invocation works on Windows)
    #import subprocess
    #subprocess.Popen('plot.png', shell=True)

    # To save the plot to an image in memory, use BytesIO and savefig()
    # This can then be written to any stream-like object, such as a
    # file or HTTP response.
    #from io import BytesIO
    #img_stream = BytesIO()
    #plt.savefig(img_stream, format='png')
    #img_bytes = img_stream.getvalue()
    #print('Image is {} bytes - {!r}'.format(len(img_bytes), img_bytes[:8] + b'...'))

    # Closing the figure allows matplotlib to release the memory used.
    plt.close()


# =====================================================================


if __name__ == '__main__':
    # Download the data set from URL
    print("Downloading data from {}".format(URL))
    frame = download_data()

    # Process data into feature and label arrays
    print("Processing {} samples with {} attributes".format(len(frame.index), len(frame.columns)))
    X_train, X_test, y_train, y_test = get_features_and_labels(frame)

    # Evaluate multiple classifiers on the data
    print("Evaluating classifiers")
    results = list(evaluate_classifier(X_train, X_test, y_train, y_test))

    # Display the results
    print("Plotting the results")
    plot(results)
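# To run the example end-to-end (assuming this file is saved as, e.g.,
# classify_spambase.py):
#   python classify_spambase.py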