Import Libraries
import pandas
Load the Iris dataset from the UCI Machine Learning Repository
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", names=names)
Get the number of rows (instances) and the number of columns (attributes)
print(dataset.shape)
(150, 5)
Get the first n rows of the dataframe using head()
# pandas.set_option('display.max_columns', None)  # uncomment to force pandas to display all columns
print(dataset.head(5))
sepal-length sepal-width petal-length petal-width class
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
Print statistical summary of the data
print(dataset.describe())
sepal-length sepal-width petal-length petal-width
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.054000 3.758667 1.198667
std 0.828066 0.433594 1.764420 0.763161
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000
Get the class distribution, i.e. how many instances there are of each class
print(dataset.groupby('class').size())
class
Iris-setosa 50
Iris-versicolor 50
Iris-virginica 50
dtype: int64
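An equivalent check, assuming the same dataset variable, is the one-line value_counts():

print(dataset['class'].value_counts())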
Display Box plots
import matplotlib.pyplot as plt
dataset.plot(kind='box', subplots=True, layout=(2,2), sharex=False, sharey=False)
print("Displaying univariate plots...")
plt.show()
Displaying univariate plots...
Plot Histogram
dataset.hist()
plt.show()
Display scatter matrix plot
from pandas.plotting import scatter_matrix
scatter_matrix(dataset)
plt.show()
As seen from the scatter matrix, the pairwise relationships, ordered from weakest to strongest correlation, are:
- Sepal length and sepal width (little to none)
- Sepal length and petal width (strong)
- Sepal length and petal length (strong)
- Petal length and petal width (strongest)
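These visual impressions can be checked numerically; a minimal sketch that computes the Pearson correlation matrix over the four numeric columns (same dataset variable assumed):

# correlations between the four measurement columns; the class column is excluded
print(dataset.iloc[:, 0:4].corr())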
Split the dataset into 80% training and 20% validation
from sklearn import model_selection
array = dataset.values
X = array[:,0:4]  # X values, i.e. the four feature columns
Y = array[:,4]    # Y value, i.e. the class label
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
scoring = 'accuracy'
print("Number of samples")
print("-------------------")
print("Training - ", len(X_train))
print("Validation - ", len(X_validation))
Number of samples
-------------------
Training - 120
Validation - 30
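The three classes here are perfectly balanced, so a plain random split works; for imbalanced data, train_test_split also accepts a stratify argument that preserves class proportions in both partitions. A sketch under the same variable names:

X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed, stratify=Y)  # stratified variant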
Build and evaluate five models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
Evaluate each model in turn
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)  # silence FutureWarnings about changing scikit-learn defaults
results = []
names = []
for name, model in models:
    # shuffle=True is required for random_state to take effect (and by recent scikit-learn versions)
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
LR: 0.966667 (0.040825)
LDA: 0.975000 (0.038188)
KNN: 0.983333 (0.033333)
CART: 0.975000 (0.038188)
NB: 0.975000 (0.053359)
SVM: 0.991667 (0.025000)
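To pick the winner programmatically rather than by reading the printout, one can compare mean cross-validation accuracies; a short sketch over the results and names lists built above:

means = [cv.mean() for cv in results]
print("Best model by mean CV accuracy:", names[means.index(max(means))])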
Compare Model Performance
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
Use a voting system to predict on the validation data; that is, the predicted class of a feature set is the majority-voted class from LR, NB and LDA
from sklearn.metrics import accuracy_score
#Use Logistic Regression to make prediction
print("\nMake predictions using Logistic Regression")
print("-----------------------------------------------------------")
lr = LogisticRegression()
lr.fit(X_train, Y_train)
predictions_lr = lr.predict(X_validation)
#print(predictions_lr)
print(accuracy_score(Y_validation, predictions_lr))
#Use Naïve Bayes to make predictions
print("\nMake predictions using Naïve Bayes")
print("-----------------------------------------------------------")
nb = GaussianNB()
nb.fit(X_train, Y_train)
predictions_nb = nb.predict(X_validation)
#print(predictions_nb)
print(accuracy_score(Y_validation, predictions_nb))
# Use Linear Discriminant Analysis (LDA)
print("\nMake predictions using Linear Discriminant Analysis")
print("-----------------------------------------------------------")
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, Y_train)
predictions_lda = lda.predict(X_validation)
#print(predictions_lda)
print(accuracy_score(Y_validation, predictions_lda))
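Accuracy alone does not show which classes get confused with which; a sketch that prints a confusion matrix and a per-class report for the LDA predictions (variables as above):

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(Y_validation, predictions_lda))
print(classification_report(Y_validation, predictions_lda))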
# Use Statistical Mode to cast vote
print("\nMake predictions using majority voting with LR, NB and LDA:")
print("-----------------------------------------------------------")
import statistics

voted_predictions = []
for i in range(len(X_validation)):
    # statistics.mode returns the majority class among the three models' predictions
    voted_predictions.append(statistics.mode([predictions_lr[i], predictions_nb[i], predictions_lda[i]]))
print(accuracy_score(Y_validation, voted_predictions))
Make predictions using Logistic Regression
-----------------------------------------------------------
0.8
Make predictions using Naïve Bayes
-----------------------------------------------------------
0.8333333333333334
Make predictions using Linear Discriminant Analysis
-----------------------------------------------------------
0.9666666666666667
Make predictions using majority voting with LR, NB and LDA:
-----------------------------------------------------------
0.9333333333333333
The voting system offers the best average performance across the three models: its accuracy is lower than that of the LDA model alone, but higher than that of either of the other two models.
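scikit-learn packages the same idea as a built-in estimator; a minimal sketch using VotingClassifier with hard (majority) voting over the same three models:

from sklearn.ensemble import VotingClassifier

voter = VotingClassifier(
    estimators=[('lr', LogisticRegression()),
                ('nb', GaussianNB()),
                ('lda', LinearDiscriminantAnalysis())],
    voting='hard')  # 'hard' = majority vote on predicted labels
voter.fit(X_train, Y_train)
print(accuracy_score(Y_validation, voter.predict(X_validation)))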