Importing all the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
plotly.offline.init_notebook_mode()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
Framing the problem¶
The goal is to classify the species of an Iris flower given the dimensions of parts of the flower. The classification is binary; there are two classes: virginica and non-virginica.
We will be using logistic regression for this classification. We will also explore how different models perform depending on the number of features (1, 2, 3 or 4) we train them with.
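As a quick refresher on what logistic regression does here: it models the probability of the positive class as a sigmoid of a linear combination of the features, and predicts virginica whenever that probability exceeds 0.5:

$$P(\text{virginica} \mid \mathbf{x}) = \sigma(\mathbf{w}^\top \mathbf{x} + b) = \frac{1}{1 + e^{-(\mathbf{w}^\top \mathbf{x} + b)}}$$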
Getting the data¶
from sklearn import datasets
iris = datasets.load_iris(as_frame=True)
X, y = datasets.load_iris(return_X_y=True, as_frame=True)
Exploratory Data Analysis¶
Redefining the data¶
First, let's see what the data looks like.
df = iris.frame
df
sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | target | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | 0 |
1 | 4.9 | 3.0 | 1.4 | 0.2 | 0 |
2 | 4.7 | 3.2 | 1.3 | 0.2 | 0 |
3 | 4.6 | 3.1 | 1.5 | 0.2 | 0 |
4 | 5.0 | 3.6 | 1.4 | 0.2 | 0 |
... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | 2 |
146 | 6.3 | 2.5 | 5.0 | 1.9 | 2 |
147 | 6.5 | 3.0 | 5.2 | 2.0 | 2 |
148 | 6.2 | 3.4 | 5.4 | 2.3 | 2 |
149 | 5.9 | 3.0 | 5.1 | 1.8 | 2 |
150 rows × 5 columns
iris.target_names
array(['setosa', 'versicolor', 'virginica'], dtype='<U10')
We will redefine the classes in our dataset. Originally, our target consists of 3 classes, but we only want two: virginica or non-virginica.
df['target'] = df['target'].apply(lambda x: 'virginica' if iris.target_names[x] == 'virginica' else 'non-virginica')
df
sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | target | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | non-virginica |
1 | 4.9 | 3.0 | 1.4 | 0.2 | non-virginica |
2 | 4.7 | 3.2 | 1.3 | 0.2 | non-virginica |
3 | 4.6 | 3.1 | 1.5 | 0.2 | non-virginica |
4 | 5.0 | 3.6 | 1.4 | 0.2 | non-virginica |
... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
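It is also worth checking the class balance after relabelling. The following one-liner (a quick sketch using the relabelled df) confirms what the describe() tables below show: 100 non-virginica rows versus 50 virginica rows, a 2:1 split worth keeping in mind when interpreting accuracy later.

df['target'].value_counts()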
Exploring and Describing the data¶
Now that we have redefined our classes, let us explore our data, make some inferences and gain some insights.
df.describe()
sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | |
---|---|---|---|---|
count | 150.000000 | 150.000000 | 150.000000 | 150.000000 |
mean | 5.843333 | 3.057333 | 3.758000 | 1.199333 |
std | 0.828066 | 0.435866 | 1.765298 | 0.762238 |
min | 4.300000 | 2.000000 | 1.000000 | 0.100000 |
25% | 5.100000 | 2.800000 | 1.600000 | 0.300000 |
50% | 5.800000 | 3.000000 | 4.350000 | 1.300000 |
75% | 6.400000 | 3.300000 | 5.100000 | 1.800000 |
max | 7.900000 | 4.400000 | 6.900000 | 2.500000 |
In the above, we can see some descriptive statistics across all the species in our dataset, but we can further examine them separately based on the two classes/groups.
df_virginica = df[df['target'] == 'virginica']
df_non_virginica = df[df['target'] == 'non-virginica']
print('VIRGINICA')
display(df_virginica.describe())
VIRGINICA
sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | |
---|---|---|---|---|
count | 50.00000 | 50.000000 | 50.000000 | 50.00000 |
mean | 6.58800 | 2.974000 | 5.552000 | 2.02600 |
std | 0.63588 | 0.322497 | 0.551895 | 0.27465 |
min | 4.90000 | 2.200000 | 4.500000 | 1.40000 |
25% | 6.22500 | 2.800000 | 5.100000 | 1.80000 |
50% | 6.50000 | 3.000000 | 5.550000 | 2.00000 |
75% | 6.90000 | 3.175000 | 5.875000 | 2.30000 |
max | 7.90000 | 3.800000 | 6.900000 | 2.50000 |
print('NON-VIRGINICA')
display(df_non_virginica.describe())
NON-VIRGINICA
sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | |
---|---|---|---|---|
count | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
mean | 5.471000 | 3.099000 | 2.861000 | 0.786000 |
std | 0.641698 | 0.478739 | 1.449549 | 0.565153 |
min | 4.300000 | 2.000000 | 1.000000 | 0.100000 |
25% | 5.000000 | 2.800000 | 1.500000 | 0.200000 |
50% | 5.400000 | 3.050000 | 2.450000 | 0.800000 |
75% | 5.900000 | 3.400000 | 4.325000 | 1.300000 |
max | 7.000000 | 4.400000 | 5.100000 | 1.800000 |
Based on the descriptive statistics above, there are a few things we can see.
In sepal length, the mean of the virginica class is higher, suggesting that flowers in this class generally have a longer sepal; we can also see this in the min and max values. This already hints that this feature will be quite useful for predicting our target, given the clear distinction between the two classes. The very similar standard deviations also show a similar spread between the two classes.
In sepal width, the non-virginica class is slightly more pronounced, but the two classes have very similar numbers and the distinction is not as clear as in sepal length. The stats here suggest that flowers in the virginica class generally have a narrower sepal.
In petal length, we see the largest difference and the clearest distinction between the two classes: flowers in the virginica class have much longer petals. I believe this feature will be very important in determining our class, perhaps in combination with other features like sepal length? More on this later.
Petal width follows the same direction as sepal length and petal length: virginica iris flowers have wider petals than non-virginica iris flowers.
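As a more compact alternative to the two describe() calls above (just a sketch showing equivalent information), the per-class means can be compared in a single call:

# Mean of each feature per class
df.groupby('target').mean(numeric_only=True)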
Visualizing the Data¶
Let us first visualize our features individually.¶
palette = sns.color_palette("husl", 2)
for feature in iris.feature_names:
    sns.histplot(data=df, x=feature, hue='target', multiple="dodge", shrink=.8, palette=palette)
    plt.show()
Just as we postulated above, we can once again see from the histograms that the virginica class is very "different" from the other class in petal length and petal width.
In sepal length, there is a bit of an overlap between the two classes, but you can still see that non-virginica flowers have shorter sepals and virginica flowers have longer ones.
It is also noticeable that sepal width is 'almost' normally distributed and that the non-virginica class has a wider spread.
Let us look at another kind of histogram plot. https://seaborn.pydata.org/generated/seaborn.histplot.html
for feature in iris.feature_names:
    sns.histplot(
        df, x=feature, y="target", hue="target", legend=False, palette=palette
    )
    plt.show()
Again, similar observations.
Let us look at a box plot next to see the distributions.
for feature in iris.feature_names:
    sns.boxplot(x='target', y=feature, data=df, palette=palette)
    plt.show()
From the box plots, we can see the same patterns we have been noticing; in addition, there are a few outliers that we may or may not have to remove in order to maximize the accuracy of our models.
What we have below is called a violin plot. https://www.kaggle.com/code/parulpandey/penguin-dataset-the-new-iris
A violin plot is a statistical graphic for comparing probability distributions. It is similar to a box plot, with the addition of a rotated kernel density plot on each side.
for feature in iris.feature_names:
    sns.violinplot(x='target', y=feature, data=df, palette=palette)
    plt.show()
Now let us visualize the features and targets against each other¶
Let's dig into this further with some scatter plots.
sns.scatterplot(data=df, x='sepal length (cm)', y='sepal width (cm)', hue='target', palette=palette)
plt.show()
sns.scatterplot(data=df, x='petal length (cm)', y='petal width (cm)', hue='target', palette=palette)
plt.show()
sns.scatterplot(data=df, x='sepal length (cm)', y='petal length (cm)', hue='target', palette=palette)
plt.show()
sns.scatterplot(data=df, x='sepal width (cm)', y='petal width (cm)', hue='target', palette=palette)
plt.show()
Lovely looking graphs. If you look at the petal length vs petal width plot in particular, you can see what we have been saying about how these features show an obvious distinction between the two classes.
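For completeness, all pairwise combinations (plus per-feature distributions) can also be drawn in one go with seaborn's pairplot; this is just a sketch using the same df and palette as above, not a plot we rely on below.

# All pairwise scatter plots, coloured by class
sns.pairplot(df, hue='target', palette=palette)
plt.show()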
Correlation Matrix¶
df_numeric = df.copy()
df_numeric['target'] = df_numeric['target'].map({'virginica': 1, 'non-virginica': 0})
corr = df_numeric.corr()
corr
sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | target | |
---|---|---|---|---|---|
sepal length (cm) | 1.000000 | -0.117570 | 0.871754 | 0.817941 | 0.638020 |
sepal width (cm) | -0.117570 | 1.000000 | -0.428440 | -0.366126 | -0.135645 |
petal length (cm) | 0.871754 | -0.428440 | 1.000000 | 0.962865 | 0.721011 |
petal width (cm) | 0.817941 | -0.366126 | 0.962865 | 1.000000 | 0.769445 |
target | 0.638020 | -0.135645 | 0.721011 | 0.769445 | 1.000000 |
sns.heatmap(corr, annot=True, cmap='coolwarm')
<Axes: >
Above, we have statistical measures further validating our claims about how each feature correlates with the others and with the target.
Here we can see that it is actually petal width that has the highest correlation with our target, with petal length not far behind. We can also see that petal width and petal length are themselves highly correlated.
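To read the feature-to-target correlations off directly (a small sketch using the corr matrix computed above), we can sort that column:

# Feature-to-target correlations, highest first
corr['target'].drop('target').sort_values(ascending=False)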
Data Preprocessing¶
To clean up our data, we check if we have any rows with a missing or null value or any outliers that could skew our data.
df.isnull().sum()
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64
All the rows are intact, as we already know.
When we visualized our data, we did notice some outliers, but for the scope of this notebook we won't be removing them; they are few and we don't have that much data anyway. (A sketch of how they could be flagged follows, for reference.)
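For reference only, here is a sketch of how such outliers could be flagged with the common 1.5×IQR rule, computed within each class. The helper iqr_outliers is introduced purely for this illustration and is not used elsewhere in the notebook.

# Sketch only (not applied): flag rows outside 1.5*IQR for any feature, within each class
def iqr_outliers(group):
    numeric = group[iris.feature_names]
    q1, q3 = numeric.quantile(0.25), numeric.quantile(0.75)
    iqr = q3 - q1
    mask = ((numeric < q1 - 1.5 * iqr) | (numeric > q3 + 1.5 * iqr)).any(axis=1)
    return group[mask]

outliers = df.groupby('target', group_keys=False).apply(iqr_outliers)
print(f"{len(outliers)} potential outlier rows flagged")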
Splitting the Data¶
Splitting the data into training, validation and test sets, using 80%, 10% and 10% respectively.
X = df.drop('target', axis=1)
y = df['target']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
print(X_train.shape, X_valid.shape, X_test.shape)
(120, 4) (15, 4) (15, 4)
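One caveat: the split above is not stratified, so the 2:1 class ratio is not guaranteed to be preserved in the small validation and test sets. For reference, a stratified variant would look roughly like the sketch below; the _s-suffixed names are hypothetical and not used in the rest of the notebook.

# Sketch only: a stratified split keeps the virginica / non-virginica ratio in every set
X_train_s, X_temp_s, y_train_s, y_temp_s = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_valid_s, X_test_s, y_valid_s, y_test_s = train_test_split(X_temp_s, y_temp_s, test_size=0.5, random_state=42, stratify=y_temp_s)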
Logistic Regression¶
Because we are going to run 4 different models with different numbers of features, we will base the choice of features on the correlation between the features and the target, and among the features themselves.
sns.heatmap(corr, annot=True, cmap='coolwarm')
<Axes: >
For the one-feature model, we will use petal width, as this has the highest positive correlation with our target.
For the two-feature model, we will use petal width and petal length; they have the two highest correlations with our target and are also highly correlated with each other.
For the three-feature model, following the same reasoning, we will use petal width, petal length and sepal length.
feature_sets = {
    "One feature": ['petal width (cm)'],
    "Two features": ['petal width (cm)', 'petal length (cm)'],
    "Three features": ['petal width (cm)', 'petal length (cm)', 'sepal length (cm)'],
    "Four features": ['petal width (cm)', 'petal length (cm)', 'sepal length (cm)', 'sepal width (cm)']
}
models_dict = {}
results_dict = {}
for feature_set_name, feature_set in feature_sets.items():
    model = LogisticRegression()
    model.fit(X_train[feature_set], y_train)
    models_dict[feature_set_name] = model

    y_valid_pred_proba = model.predict_proba(X_valid[feature_set])[:, 1]
    y_valid_pred = model.predict(X_valid[feature_set])

    results = pd.DataFrame({
        'Instance number': X_valid.index,
        'Probability of predicting virginica': y_valid_pred_proba,
        'Actual prediction by the model': y_valid_pred,
        'Ground truth': y_valid
    })
    results_dict[feature_set_name] = results
Model Evaluation¶
Scoring the Models¶
Let us see how each model performed.
for feature_set_name, results in results_dict.items():
    print(f"Results for model with {feature_set_name}:")
    display(results)
Results for model with One feature:
Instance number | Probability of predicting virginica | Actual prediction by the model | Ground truth | |
---|---|---|---|---|
26 | 26 | 0.007232 | non-virginica | non-virginica |
18 | 18 | 0.004901 | non-virginica | non-virginica |
118 | 118 | 0.925283 | virginica | virginica |
145 | 145 | 0.925283 | virginica | virginica |
78 | 78 | 0.350797 | non-virginica | non-virginica |
127 | 127 | 0.636206 | virginica | virginica |
108 | 108 | 0.636206 | virginica | virginica |
55 | 55 | 0.198051 | non-virginica | non-virginica |
30 | 30 | 0.003318 | non-virginica | non-virginica |
29 | 29 | 0.003318 | non-virginica | non-virginica |
141 | 141 | 0.925283 | virginica | virginica |
110 | 110 | 0.792805 | virginica | virginica |
19 | 19 | 0.004901 | non-virginica | non-virginica |
132 | 132 | 0.893300 | virginica | virginica |
64 | 64 | 0.198051 | non-virginica | non-virginica |
Results for model with Two features:
Instance number | Probability of predicting virginica | Actual prediction by the model | Ground truth | |
---|---|---|---|---|
26 | 26 | 0.000014 | non-virginica | non-virginica |
18 | 18 | 0.000015 | non-virginica | non-virginica |
118 | 118 | 0.998655 | virginica | virginica |
145 | 145 | 0.897028 | virginica | virginica |
78 | 78 | 0.210752 | non-virginica | non-virginica |
127 | 127 | 0.585651 | virginica | virginica |
108 | 108 | 0.936985 | virginica | virginica |
55 | 55 | 0.150068 | non-virginica | non-virginica |
30 | 30 | 0.000009 | non-virginica | non-virginica |
29 | 29 | 0.000009 | non-virginica | non-virginica |
141 | 141 | 0.870248 | virginica | virginica |
110 | 110 | 0.782899 | virginica | virginica |
19 | 19 | 0.000009 | non-virginica | non-virginica |
132 | 132 | 0.952741 | virginica | virginica |
64 | 64 | 0.016507 | non-virginica | non-virginica |
Results for model with Three features:
Instance number | Probability of predicting virginica | Actual prediction by the model | Ground truth | |
---|---|---|---|---|
26 | 26 | 0.000015 | non-virginica | non-virginica |
18 | 18 | 0.000011 | non-virginica | non-virginica |
118 | 118 | 0.997968 | virginica | virginica |
145 | 145 | 0.874726 | virginica | virginica |
78 | 78 | 0.214948 | non-virginica | non-virginica |
127 | 127 | 0.593969 | virginica | virginica |
108 | 108 | 0.930751 | virginica | virginica |
55 | 55 | 0.174016 | non-virginica | non-virginica |
30 | 30 | 0.000011 | non-virginica | non-virginica |
29 | 29 | 0.000011 | non-virginica | non-virginica |
141 | 141 | 0.827581 | virginica | virginica |
110 | 110 | 0.759765 | virginica | virginica |
19 | 19 | 0.000009 | non-virginica | non-virginica |
132 | 132 | 0.952516 | virginica | virginica |
64 | 64 | 0.017705 | non-virginica | non-virginica |
Results for model with Four features:
Instance number | Probability of predicting virginica | Actual prediction by the model | Ground truth | |
---|---|---|---|---|
26 | 26 | 0.000009 | non-virginica | non-virginica |
18 | 18 | 0.000006 | non-virginica | non-virginica |
118 | 118 | 0.998534 | virginica | virginica |
145 | 145 | 0.873922 | virginica | virginica |
78 | 78 | 0.207005 | non-virginica | non-virginica |
127 | 127 | 0.572730 | virginica | virginica |
108 | 108 | 0.946564 | virginica | virginica |
55 | 55 | 0.170670 | non-virginica | non-virginica |
30 | 30 | 0.000008 | non-virginica | non-virginica |
29 | 29 | 0.000008 | non-virginica | non-virginica |
141 | 141 | 0.820222 | virginica | virginica |
110 | 110 | 0.728198 | virginica | virginica |
19 | 19 | 0.000004 | non-virginica | non-virginica |
132 | 132 | 0.956114 | virginica | virginica |
64 | 64 | 0.016196 | non-virginica | non-virginica |
To summarize the data in each table with just one value, we will use the accuracy score provided by sklearn.
from sklearn.metrics import accuracy_score
accuracies = {}
for feature_set_name, results in results_dict.items():
    accuracy = accuracy_score(y_valid, results['Actual prediction by the model'])
    accuracies[feature_set_name] = accuracy
    print(f"Accuracy for model with {feature_set_name}: {accuracy:.2f}")
Accuracy for model with One feature: 1.00
Accuracy for model with Two features: 1.00
Accuracy for model with Three features: 1.00
Accuracy for model with Four features: 1.00
What the above means is that, based on our choices of features for each model, every model achieves a 100% accuracy score on the validation data.
Decision Boundary Graph¶
display(X_valid)
sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | |
---|---|---|---|---|
26 | 5.0 | 3.4 | 1.6 | 0.4 |
18 | 5.7 | 3.8 | 1.7 | 0.3 |
118 | 7.7 | 2.6 | 6.9 | 2.3 |
145 | 6.7 | 3.0 | 5.2 | 2.3 |
78 | 6.0 | 2.9 | 4.5 | 1.5 |
127 | 6.1 | 3.0 | 4.9 | 1.8 |
108 | 6.7 | 2.5 | 5.8 | 1.8 |
55 | 5.7 | 2.8 | 4.5 | 1.3 |
30 | 4.8 | 3.1 | 1.6 | 0.2 |
29 | 4.7 | 3.2 | 1.6 | 0.2 |
141 | 6.9 | 3.1 | 5.1 | 2.3 |
110 | 6.5 | 3.2 | 5.1 | 2.0 |
19 | 5.1 | 3.8 | 1.5 | 0.3 |
132 | 6.4 | 2.8 | 5.6 | 2.2 |
64 | 5.6 | 2.9 | 3.6 | 1.3 |
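Before plotting, recall how a logistic regression decision boundary is obtained: the model predicts virginica where the linear term is positive, so the boundary is the set of points where w·x + b = 0. As a quick sanity check (a small sketch using the one-feature model trained above), we can print the fitted coefficient and the implied petal-width threshold:

# For a single feature, the 0.5-probability boundary is at w*x + b = 0, i.e. x = -b / w
one_feature_model = models_dict["One feature"]
w = one_feature_model.coef_[0][0]
b = one_feature_model.intercept_[0]
print(f"w = {w:.3f}, b = {b:.3f}, boundary at petal width = {-b / w:.3f} cm")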
feature_set_name = "One feature"
model = models_dict[feature_set_name]
feature_set = feature_sets[feature_set_name]
y_valid_pred = model.predict(X_valid[feature_set])  # predictions of the one-feature model

plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_valid[feature_set[0]], y=y_valid, hue=y_valid_pred)

# The 0.5-probability boundary is where w*x + b = 0, i.e. x = -b / w
boundary = (-model.intercept_ / model.coef_).item()
plt.axvline(x=boundary, color='red')

# Overlay the predicted probability of virginica across the feature range
x_values = pd.DataFrame(np.linspace(X_valid[feature_set[0]].min(), X_valid[feature_set[0]].max(), 100), columns=[feature_set[0]])
y_values = model.predict_proba(x_values)[:, 1]
plt.plot(x_values, y_values, color='blue', alpha=0.2)

plt.title(f"Decision Boundary for {feature_set_name}")
plt.show()
feature_set_name = "Two features"
model = models_dict[feature_set_name]
feature_set = feature_sets[feature_set_name]
fig = px.scatter(X_valid, x=feature_set[0], y=feature_set[1], color=y_valid_pred, color_discrete_map={0: 'blue', 1: 'red'})
MIN_FEATURE_VALUE = X_valid.min().min()
MAX_FEATURE_VALUE = X_valid.max().max()
decision_boundary_x1 = np.linspace(MIN_FEATURE_VALUE, MAX_FEATURE_VALUE, 10)
decision_boundary_x2 = -model.intercept_ / model.coef_[0][1] - model.coef_[0][0] / model.coef_[0][1] * decision_boundary_x1
fig.add_scatter(x=decision_boundary_x1, y=decision_boundary_x2, mode='lines', name='Decision Boundary', line=dict(color='red', width=1))
fig.update_layout(title_text=f'Decision Boundary for 2 features', width=800, height=400)
fig.show()
import plotly.graph_objects as go

feature_set_name = "Three features"
model = models_dict[feature_set_name]
feature_set = feature_sets[feature_set_name]
y_valid_pred = model.predict(X_valid[feature_set])  # predictions of the three-feature model

fig = go.Figure()

# Split the validation points by predicted class
X_valid_0 = X_valid[y_valid_pred == 'non-virginica']
X_valid_1 = X_valid[y_valid_pred == 'virginica']

fig.add_trace(go.Scatter3d(
    x=X_valid_0[feature_set[0]],
    y=X_valid_0[feature_set[1]],
    z=X_valid_0[feature_set[2]],
    mode='markers',
    marker=dict(
        size=5,
        color='green',
        opacity=0.8
    ),
    name='non-virginica'
))

fig.add_trace(go.Scatter3d(
    x=X_valid_1[feature_set[0]],
    y=X_valid_1[feature_set[1]],
    z=X_valid_1[feature_set[2]],
    mode='markers',
    marker=dict(
        size=5,
        color='purple',
        opacity=0.8
    ),
    name='virginica'
))

# The decision boundary is the plane w1*x1 + w2*x2 + w3*x3 + b = 0, solved here for x3
x1, x2 = np.meshgrid(np.linspace(MIN_FEATURE_VALUE, MAX_FEATURE_VALUE, 10), np.linspace(MIN_FEATURE_VALUE, MAX_FEATURE_VALUE, 10))
decision_boundary_x3 = -model.intercept_ / model.coef_[0][2] - model.coef_[0][0] / model.coef_[0][2] * x1 - model.coef_[0][1] / model.coef_[0][2] * x2

fig.add_trace(go.Surface(x=x1, y=x2, z=decision_boundary_x3, colorscale='Reds', opacity=0.6))

fig.update_layout(
    title_text='Decision Boundary for 3 features',
    width=1200,
    height=800,
    scene=dict(
        xaxis_title='petal width (cm)',
        yaxis_title='petal length (cm)',
        zaxis_title='sepal length (cm)'
    )
)

camera = dict(
    eye=dict(x=2, y=2, z=1.1)
)
fig.update_layout(scene_camera=camera)

fig.show()
Failure Modes¶
As we saw with the accuracy score on our validation set, all the predictions were correct, so there is no way to "easily" identify the instances where our model fails.
Let us go even further and use the other metrics provided by sklearn.
Precision Score
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
precisions = {}
for feature_set_name, results in results_dict.items():
    precision = precision_score(y_valid, results['Actual prediction by the model'], pos_label='virginica')
    precisions[feature_set_name] = precision
    print(f"Precision for model with {feature_set_name}: {precision:.2f}")
Precision for model with One feature: 1.00
Precision for model with Two features: 1.00
Precision for model with Three features: 1.00
Precision for model with Four features: 1.00
recalls = {}
for feature_set_name, results in results_dict.items():
    recall = recall_score(y_valid, results['Actual prediction by the model'], pos_label='virginica')
    recalls[feature_set_name] = recall
    print(f"Recall score for model with {feature_set_name}: {recall:.2f}")
Recall score for model with One feature: 1.00
Recall score for model with Two features: 1.00
Recall score for model with Three features: 1.00
Recall score for model with Four features: 1.00
f1s = {}
for feature_set_name, results in results_dict.items():
    f1 = f1_score(y_valid, results['Actual prediction by the model'], pos_label='virginica')
    f1s[feature_set_name] = f1
    print(f"F1 Score for model with {feature_set_name}: {f1:.2f}")
F1 Score for model with One feature: 1.00
F1 Score for model with Two features: 1.00
F1 Score for model with Three features: 1.00
F1 Score for model with Four features: 1.00
confusion_matrices = {}
for feature_set_name, results in results_dict.items():
    confusion = confusion_matrix(y_valid, results['Actual prediction by the model'])
    confusion_matrices[feature_set_name] = confusion
    print(f"Confusion matrix for model with {feature_set_name}:\n {confusion}")
Confusion matrix for model with One feature:
 [[8 0]
 [0 7]]
Confusion matrix for model with Two features:
 [[8 0]
 [0 7]]
Confusion matrix for model with Three features:
 [[8 0]
 [0 7]]
Confusion matrix for model with Four features:
 [[8 0]
 [0 7]]
As we can see, still all perfect scores.
Now, I am more concerned as to why this is happening. I have a few ideas.
- Our dataset: we have quite a small number of rows in our data, 150. This could be a reason why we are having such high accuracy scores.
- The proportion of our data that we are using to train our model, 80%, is quite high. I played around with this figure and noticed that the lower the percentage of training data, the less accurate our models become.
- We are only predicting two classes. This "kind of" makes the job of our model easy.
- There is a high positive correlation between our features and our target. This could also be why we are having such high scores.
- We didn't do cross-validation (a sketch of what that could look like follows below).
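Cross-validation is easy to add with scikit-learn; here is a rough sketch (not run as part of this analysis) of what 5-fold cross-validated accuracy per feature set could look like, reusing the feature_sets dictionary and the full X and y defined earlier:

from sklearn.model_selection import cross_val_score

# Sketch only: 5-fold cross-validated accuracy for each feature set, on the full dataset
for feature_set_name, feature_set in feature_sets.items():
    scores = cross_val_score(LogisticRegression(), X[feature_set], y, cv=5, scoring='accuracy')
    print(f"{feature_set_name}: mean accuracy {scores.mean():.3f} (std {scores.std():.3f})")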
With that being said, something that caught my eye, and that I feel can be used to guide our decisions when evaluating these models, is the probability behind each prediction.
If you look at each of our models, they assign different probabilities to predicting virginica or non-virginica.
I believe that a model that has a low(er) probability of predicting virginica on an instance whose ground truth is non-virginica is good, and likewise a model with a very high probability of predicting virginica when the ground truth is virginica is very good.
In the absence of a proper classification evaluation metric, and with the metrics from sklearn all scoring 100%, I believe this should do.
# we are using a 50% probability threshold.
def calculate_total_score(df):
    def calculate_score(row):
        prob = row['Probability of predicting virginica']
        ground_truth = row['Ground truth']
        if ground_truth == 'virginica':
            return prob - 0.5
        else:
            return 0.5 - prob

    df['Score'] = df.apply(calculate_score, axis=1)
    return df['Score'].sum()
total_scores = {feature_set_name: calculate_total_score(df) for feature_set_name, df in results_dict.items()}
for feature_set_name, total_score in total_scores.items():
    print(f"Total score for model with {feature_set_name}: {total_score}")
Total score for model with One feature: 5.463795534529218
Total score for model with Two features: 6.146823413627794
Total score for model with Three features: 6.030550287627612
Total score for model with Four features: 6.002378652450679
From the above, we can see that the model with two features has the highest score based on our algorithm. This makes total sense because, as we observed during our EDA, petal width and petal length have the most positive correlation with our target.
One thing to point out based on these scores is that adding less-correlated features to our model actually brings down the confidence of the probabilities, as we can see in the model with 4 features.
Another point to note is that the one-feature model scores lowest, even though its feature is petal width, which has the highest correlation with our target. This suggests that more (positively correlated) features can improve the confidence of our classification.
Now, while this scoring algorithm might not be perfect, I believe it should do for the scope of this analysis.
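For reference, scikit-learn does provide a standard metric that captures the same intuition about probability quality: log loss (cross-entropy), where lower is better. A sketch of how it could be compared across the models, using the results tables built above:

from sklearn.metrics import log_loss

# Lower is better: log loss penalizes confident wrong (and under-confident right) probabilities
for feature_set_name, results in results_dict.items():
    loss = log_loss(results['Ground truth'],
                    results['Probability of predicting virginica'],
                    labels=['non-virginica', 'virginica'])
    print(f"Log loss for model with {feature_set_name}: {loss:.4f}")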
Recommend the Best Model.¶
Looking at the analysis we have just done above, I think we can choose the model with two features as our best model.
It uses the two most positively correlated features, scored 100% on all the evaluation metrics, and came out on top in our scoring algorithm as well.
Now let us see how the model performs on our test data.
best_score = max(total_scores, key=total_scores.get)
best_model = models_dict[best_score]
best_model
LogisticRegression()
# Predict the probabilities and classes on the test set using the best model
y_test_pred_proba = best_model.predict_proba(X_test[feature_sets[best_score]])[:, 1]
y_test_pred = best_model.predict(X_test[feature_sets[best_score]])
# Create a dataframe with the test results
test_results = pd.DataFrame({
    'Instance number': X_test.index,
    'Probability of predicting virginica': y_test_pred_proba,
    'Actual prediction by the model': y_test_pred,
    'Ground truth': y_test
})
display(test_results)
Instance number | Probability of predicting virginica | Actual prediction by the model | Ground truth | |
---|---|---|---|---|
143 | 143 | 0.981925 | virginica | virginica |
56 | 56 | 0.356495 | non-virginica | non-virginica |
128 | 128 | 0.942506 | virginica | virginica |
69 | 69 | 0.023739 | non-virginica | non-virginica |
68 | 68 | 0.210752 | non-virginica | non-virginica |
82 | 82 | 0.029036 | non-virginica | non-virginica |
45 | 45 | 0.000007 | non-virginica | non-virginica |
131 | 131 | 0.990823 | virginica | virginica |
36 | 36 | 0.000004 | non-virginica | non-virginica |
73 | 73 | 0.194982 | non-virginica | non-virginica |
76 | 76 | 0.322391 | non-virginica | non-virginica |
104 | 104 | 0.971436 | virginica | virginica |
31 | 31 | 0.000011 | non-virginica | non-virginica |
9 | 9 | 0.000006 | non-virginica | non-virginica |
12 | 12 | 0.000004 | non-virginica | non-virginica |
Another 100/100 performance.
Let us calculate the score using scikit learn and also with our algorithm.
accuracy = accuracy_score(y_test, test_results['Actual prediction by the model'])
print(f"Accuracy for the best model ({best_score}): {accuracy:.2f}")
Accuracy for the best model (Two features): 1.00
calculate_total_score(test_results)
6.249262208930103
This is a higher score than we got on the validation set (which has the same number of rows), so this does not look like a case of overfitting.
Lovely stuff.