# Importing Python libraries needed for data analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
import math
import plotly.io as pio
pio.renderers.default='notebook'
from sklearn.cluster import KMeans
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
%matplotlib inline
# reading dataset
dataset = pd.read_excel('filename.xlsx').iloc[:,:-1]
! python --version
Python 3.7.12
dataset.shape
(189, 46)
# Fix the column headers: forward-fill 'Unnamed' (merged) columns and strip whitespace
cols = []
for col in dataset.columns:
    if col.startswith('Unnamed'):
        cols.append(cols[-1])
    else:
        cols.append(col.strip())
dataset.columns = cols
# Merge the sub-header row (row 0) into the column names, then strip the trailing '--'
dataset.columns = [i + '--' + str(z).strip() for i, z in zip(dataset.columns, dataset.iloc[0, :].fillna('').values)]
dataset.columns = [col.strip('--') for col in dataset.columns]
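As an illustration of what the two renaming steps above do, here is a minimal sketch on a made-up toy frame with the same two-level header layout (the column names and values are invented for the example):
import pandas as pd
# Hypothetical two-row header: Excel merged cells show up as 'Unnamed: N' columns
toy = pd.DataFrame(
    [['MA', 'OA', ''],      # sub-header row (row 0)
     [1, 0, 42]],
    columns=['Ictal signs', 'Unnamed: 1', 'Age'])
cols = []
for col in toy.columns:
    # repeat the previous name for merged ('Unnamed') cells
    cols.append(cols[-1] if col.startswith('Unnamed') else col.strip())
toy.columns = cols
# append the sub-header, then strip the trailing '--' where the sub-header was empty
toy.columns = [f'{c}--{str(s).strip()}' for c, s in zip(toy.columns, toy.iloc[0].fillna(''))]
toy.columns = [c.strip('--') for c in toy.columns]
print(toy.columns.tolist())  # ['Ictal signs--MA', 'Ictal signs--OA', 'Age']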
# Data cleaning: coerce 'Consciousness Time' to numeric, drop the sub-header row and rows with missing values
dataset['Consciousness Time']=pd.to_numeric(dataset['Consciousness Time'],errors='coerce')
dataset.drop(0,inplace=True)
dataset.dropna(inplace=True)
dataset.reset_index(inplace=True,drop=True)
dataset.columns
Index(['Nr', 'Location', 'Sex', 'Age', 'Age onset', 'Years with ES', 'Seizure Type', 'Laterality', 'Behavior before', 'Same day ES before', 'ES before', 'Ictal Seconds', 'Ictal signs and symtoms--MA', 'Ictal signs and symtoms--OA', 'Ictal signs and symtoms--SMA', 'Ictal signs and symtoms--Laughing', 'Ictal signs and symtoms--Coughing', 'Ictal signs and symtoms--NRR', 'Ictal signs and symtoms--NRL', 'Ictal signs and symtoms--Vo', 'Ictal signs and symtoms--Gaze', 'Ictal signs and symtoms--VA', 'Ictal signs and symtoms--Hiccup', 'Consciousness Time', 'Postictal signs and symptoms--MA', 'Postictal signs and symptoms--OA', 'Postictal signs and symptoms--NRR', 'Postictal signs and symptoms--NRL', 'Postictal signs and symptoms--Smacking', 'Postictal signs and symptoms--Smile', 'Postictal signs and symptoms--Laughing', 'Postictal signs and symptoms--Coughing', 'Postictal signs and symptoms--Vo', 'Postictal signs and symptoms--Gape', 'Postictal signs and symptoms--Hipcup', 'Postictal signs and symptoms--Motor restless', 'Postictal signs and symptoms--Speaks incomprehensible', 'Postictal signs and symptoms--Cloni Arm', 'Postictal signs and symptoms--Stand up', 'Level of Consciousness', 'Coughing Time seconds--Coughing #1', 'Coughing Time seconds--Coughing #2', 'Coughing Time seconds--Coughing #3', 'Coughing Time seconds--Coughing #4', 'Disnomia seconds', 'Aphasia TT'], dtype='object')
# Examining the dataframe
dataset.head(25)
  | Nr | Location | Sex | Age | Age onset | Years with ES | Seizure Type | Laterality | Behavior before | Same day ES before | ... | Postictal signs and symptoms--Speaks incomprehensible | Postictal signs and symptoms--Cloni Arm | Postictal signs and symptoms--Stand up | Level of Consciousness | Coughing Time seconds--Coughing #1 | Coughing Time seconds--Coughing #2 | Coughing Time seconds--Coughing #3 | Coughing Time seconds--Coughing #4 | Disnomia seconds | Aphasia TT |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | TR | 2.0 | 20.0 | 9.0 | 11.0 | 4.0 | 1.0 | 1.0 | 2.0 | ... | 0 | 0 | 0 | 2.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
1 | 2.0 | TR | 1.0 | 25.0 | 22.0 | 3.0 | 9.0 | 1.0 | 1.0 | 3.0 | ... | 0 | 0 | 0 | 5.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
2 | 3.0 | TR | 2.0 | 28.0 | 6.0 | 22.0 | 10.0 | 1.0 | 2.0 | 0.0 | ... | 0 | 0 | 0 | 2.0 | 39 | 0 | 0 | 0 | 0.0 | 0.0 |
3 | 4.0 | TR | 2.0 | 42.0 | 1.0 | 41.0 | 11.0 | 1.0 | 1.0 | 2.0 | ... | 0 | 0 | 0 | 5.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
4 | 5.0 | TR | 2.0 | 35.0 | 10.0 | 25.0 | 4.0 | 1.0 | 1.0 | 0.0 | ... | 0 | 0 | 0 | 2.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
5 | 6.0 | TR | 2.0 | 29.0 | 20.0 | 9.0 | 4.0 | 2.0 | 2.0 | 0.0 | ... | 0 | 0 | 0 | 4.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
6 | 7.0 | TR | 1.0 | 57.0 | 3.0 | 54.0 | 4.0 | 1.0 | 1.0 | 0.0 | ... | 0 | 0 | 0 | 2.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
7 | 8.0 | TR | 1.0 | 24.0 | 4.0 | 20.0 | 2.0 | 1.0 | 1.0 | 2.0 | ... | 0 | 0 | 0 | 2.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
8 | 9.0 | TR | 1.0 | 34.0 | 7.0 | 27.0 | 2.0 | 1.0 | 1.0 | 4.0 | ... | 0 | 0 | 0 | 2.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
9 | 10.0 | TR | 2.0 | 42.0 | 1.0 | 41.0 | 2.0 | 1.0 | 1.0 | 4.0 | ... | 0 | 1 | 0 | 5.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
10 | 11.0 | TR | 1.0 | 22.0 | 13.0 | 9.0 | 2.0 | 1.0 | 2.0 | 0.0 | ... | 0 | 0 | 0 | 2.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
11 | 12.0 | TR | 2.0 | 20.0 | 10.0 | 10.0 | 2.0 | 1.0 | 2.0 | 1.0 | ... | 0 | 0 | 0 | 2.0 | 2 | 0 | 0 | 0 | 0.0 | 0.0 |
12 | 13.0 | TR | 1.0 | 19.0 | 14.0 | 5.0 | 2.0 | 1.0 | 2.0 | 1.0 | ... | 0 | 0 | 0 | 2.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
13 | 14.0 | TR | 2.0 | 21.0 | 10.0 | 11.0 | 6.0 | 2.0 | 1.0 | 1.0 | ... | 0 | 0 | 0 | 2.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
14 | 15.0 | TR | 1.0 | 43.0 | 16.0 | 27.0 | 7.0 | 2.0 | 1.0 | 0.0 | ... | 0 | 0 | 0 | 4.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
15 | 16.0 | TR | 2.0 | 20.0 | 10.0 | 10.0 | 3.0 | 1.0 | 1.0 | 0.0 | ... | 0 | 0 | 0 | 10.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
16 | 18.0 | TR | 1.0 | 61.0 | 30.0 | 31.0 | 1.0 | 1.0 | 2.0 | 2.0 | ... | 0 | 0 | 0 | 2.0 | 0 | 0 | 0 | 0 | 46.0 | 0.0 |
17 | 19.0 | TR | 2.0 | 20.0 | 9.0 | 11.0 | 1.0 | 1.0 | 1.0 | 5.0 | ... | 0 | 0 | 0 | 5.0 | 0 | 0 | 0 | 0 | 0.0 | 60.0 |
18 | 20.0 | TR | 1.0 | 24.0 | 23.0 | 1.0 | 1.0 | 1.0 | 2.0 | 6.0 | ... | 0 | 0 | 0 | 2.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
19 | 21.0 | TR | 2.0 | 30.0 | 16.0 | 14.0 | 1.0 | 1.0 | 1.0 | 1.0 | ... | 0 | 0 | 0 | 4.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
20 | 24.0 | TR | 2.0 | 18.0 | 16.0 | 2.0 | 1.0 | 1.0 | 1.0 | 1.0 | ... | 0 | 0 | 0 | 2.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
21 | 25.0 | TR | 2.0 | 53.0 | 51.0 | 2.0 | 1.0 | 1.0 | 1.0 | 1.0 | ... | 0 | 0 | 0 | 2.0 | 7 | 0 | 0 | 0 | 0.0 | 0.0 |
22 | 26.0 | TR | 1.0 | 24.0 | 22.0 | 2.0 | 1.0 | 1.0 | 1.0 | 1.0 | ... | 0.083333 | 0.125 | 0.166667 | 2.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
23 | 27.0 | TR | 2.0 | 53.0 | 14.0 | 39.0 | 1.0 | 1.0 | 2.0 | 0.0 | ... | 1 | 0 | 0 | 2.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
24 | 28.0 | TR | 1.0 | 52.0 | 1.0 | 51.0 | 1.0 | 3.0 | 1.0 | 1.0 | ... | 0 | 0 | 0 | 4.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
25 rows × 46 columns
# Scatterplot graphs: cast the coded columns to strings so seaborn treats them as categories
df1 = dataset.loc[:, 'Sex':'Ictal Seconds']
for col in df1.columns[1:-1]:
    df1[col] = df1[col].astype(int).astype(str)
df1['Age'] = df1['Age'].astype(int)           # keep Age numeric
df1['Consciousness Time'] = pd.to_numeric(dataset['Consciousness Time'])
sns.set(rc={'figure.figsize':(20,20)})
g = sns.FacetGrid(df1, col="Sex", height=8.27, aspect=11.7/8.27)
g.map(sns.scatterplot, 'Consciousness Time', 'Age', alpha=.7)
g.add_legend()
<seaborn.axisgrid.FacetGrid at 0x1d080c16fa0>
g = sns.FacetGrid(df1, col="Laterality", height=4, aspect=.5)
g.map(sns.barplot, "Sex", "Consciousness Time")
C:\Users\tommy\anaconda3\lib\site-packages\seaborn\axisgrid.py:670: UserWarning: Using the barplot function without specifying `order` is likely to produce an incorrect plot.
<seaborn.axisgrid.FacetGrid at 0x1d0814e39a0>
g = sns.FacetGrid(df1, col="Behavior before", height=4, aspect=.5)
g.map(sns.barplot, "Sex", "Consciousness Time")
C:\Users\tommy\anaconda3\lib\site-packages\seaborn\axisgrid.py:670: UserWarning: Using the barplot function without specifying `order` is likely to produce an incorrect plot.
<seaborn.axisgrid.FacetGrid at 0x1d08198d520>
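The UserWarnings above come from mapping sns.barplot through FacetGrid.map without a fixed category order. A minimal way to silence them, assuming a seaborn version where barplot accepts the order keyword through map (the 0.11 series used here does), is to pass the sorted categories explicitly:
# Fix the category order so every facet draws the bars consistently
sex_order = sorted(df1["Sex"].unique())
g = sns.FacetGrid(df1, col="Laterality", height=4, aspect=.5)
g.map(sns.barplot, "Sex", "Consciousness Time", order=sex_order)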
## Distribution of consciousness time, one row per sex
g = sns.FacetGrid(df1, row="Sex", height=1.7, aspect=4)
g.map(sns.kdeplot, "Consciousness Time")
<seaborn.axisgrid.FacetGrid at 0x1d0819a0bb0>
df=dataset.loc[:,'Sex':'Consciousness Time']
corr_df=df.corr()
fig = px.imshow(corr_df)
fig.update_layout(title='Correlation comparison for main features', width=1000, height=1000)
fig.show()
# fig.write_html(r'D:\Machine learning\Epilepsia SUSANA\correlacion.html')
def age_group(x):
    if x <= 20:
        return '<21'
    elif 20 < x <= 29:
        return '20-29'
    elif 29 < x <= 39:
        return '29-39'
    elif 39 < x <= 49:
        return '39-49'
    elif 49 < x <= 59:
        return '49-59'
    else:
        return '60+'
df['Age_group']=df['Age'].apply(age_group)
df_age_group=df.groupby('Age_group')['Age'].count().reset_index().rename(columns={'Age':'count'})
df_age_group.head(10)
  | Age_group | count |
---|---|---|
0 | 20-29 | 35 |
1 | 29-39 | 38 |
2 | 39-49 | 49 |
3 | 49-59 | 27 |
4 | 60+ | 19 |
5 | <21 | 16 |
fig = px.bar(df_age_group, x="Age_group", y="count",color="Age_group")
fig.update_layout(
title="Number of patients in each group",
)
fig.show()
##dropping age and encoding age groups
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
df['Age_group']=encoder.fit_transform(df['Age_group'])
df.drop(columns=['Age'],inplace=True)
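One caveat worth flagging: LabelEncoder assigns codes in sorted-string order, so here '<21' sorts after '60+' and the youngest patients end up with the largest code. If the encoded age group is treated as an ordered quantity (it is used as the prediction target below), an explicit ordered mapping is safer. A minimal sketch, assuming the same six labels, kept separate here so it does not alter the results shown below:
# Explicit order so the integer codes follow actual age, youngest to oldest;
# this could replace the LabelEncoder call above
age_order = ['<21', '20-29', '29-39', '39-49', '49-59', '60+']
ordered_codes = pd.Categorical(dataset['Age'].apply(age_group), categories=age_order, ordered=True).codes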
# Using scikit-learn to split the data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(df.iloc[:,:-1], df.iloc[:,-1], test_size = 0.25, random_state = 42)
# data shape for test and training subsets
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)
Training Features Shape: (138, 21)
Training Labels Shape: (138,)
Testing Features Shape: (46, 21)
Testing Labels Shape: (46,)
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_labels);
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
# Calculate the absolute errors
errors = abs(predictions - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))
Mean Absolute Error: 0.91
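An MAE of 0.91 on a target encoded 0-5 is easier to judge against a trivial baseline. A quick hedged check, assuming the same train/test split as above, is to compare with a dummy model that always predicts the training mean:
from sklearn.dummy import DummyRegressor
# Baseline: always predict the mean age-group code seen in training
baseline = DummyRegressor(strategy='mean').fit(train_features, train_labels)
baseline_mae = np.mean(np.abs(baseline.predict(test_features) - test_labels))
print('Baseline MAE:', round(baseline_mae, 2))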
# Index sort the most important features
sorted_feature_weight_idxes = np.argsort(rf.feature_importances_)[::-1] # Reverse sort
# Get the most important features names and weights
most_important_features = np.take_along_axis(
np.array(df.columns.tolist()),
sorted_feature_weight_idxes, axis=0)
most_important_weights = np.take_along_axis(
np.array(rf.feature_importances_),
sorted_feature_weight_idxes, axis=0)
##feature importances for prediction
list(zip(most_important_features, most_important_weights))
[('Years with ES', 0.3596060876417491), ('Age onset', 0.21197770229740107), ('ES before', 0.13750612173869242), ('Ictal Seconds', 0.08368598620142682), ('Consciousness Time', 0.04445168755392684), ('Seizure Type', 0.04428449342427326), ('Ictal signs and symtoms--MA', 0.036814401126194496), ('Same day ES before', 0.026543817597985287), ('Ictal signs and symtoms--Vo', 0.010969131970429053), ('Laterality', 0.010028266107102828), ('Behavior before', 0.008891861405386075), ('Sex', 0.007201983453059538), ('Ictal signs and symtoms--Laughing', 0.0065720616668258755), ('Ictal signs and symtoms--OA', 0.004895333068464376), ('Ictal signs and symtoms--SMA', 0.0029995569551857304), ('Ictal signs and symtoms--Coughing', 0.002668113593882296), ('Ictal signs and symtoms--NRR', 0.00043683954251242367), ('Ictal signs and symtoms--Hiccup', 0.00030665566347807036), ('Ictal signs and symtoms--NRL', 0.00010629466614981067), ('Ictal signs and symtoms--Gaze', 5.360432587465697e-05), ('Ictal signs and symtoms--VA', 0.0)]
import plotly.express as px
## Tree map of location, sex and age: shows the number of patients in each (location, sex, age) group
treedata = dataset.copy()
for col in ['Sex', 'Age', 'Age onset']:
    treedata[col] = treedata[col].astype(int).astype(str)
treedata=treedata.groupby(['Location', 'Sex', 'Age'])['Nr'].count().reset_index().rename(columns={'Nr':'count'})
fig = px.treemap(treedata, path=['Location', 'Sex', 'Age'], values='count')
fig.update_layout(autosize=False,width=800,height=700)
fig.show()
# fig.write_html(r'D:\Machine learning\Epilepsia SUSANA\treemap.html')
treedata.columns
Index(['Location', 'Sex', 'Age', 'count'], dtype='object')
# Kmeans preparation
from sklearn.preprocessing import LabelEncoder
df=dataset.loc[:,'Location':'Ictal Seconds']
data2=df
encoder=LabelEncoder()
data2['Location']=encoder.fit_transform(data2['Location'])
data2
  | Location | Sex | Age | Age onset | Years with ES | Seizure Type | Laterality | Behavior before | Same day ES before | ES before | Ictal Seconds |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | 2.0 | 20.0 | 9.0 | 11.0 | 4.0 | 1.0 | 1.0 | 2.0 | 6.0 | 91.0 |
1 | 3 | 1.0 | 25.0 | 22.0 | 3.0 | 9.0 | 1.0 | 1.0 | 3.0 | 3.0 | 98.0 |
2 | 3 | 2.0 | 28.0 | 6.0 | 22.0 | 10.0 | 1.0 | 2.0 | 0.0 | 0.0 | 90.0 |
3 | 3 | 2.0 | 42.0 | 1.0 | 41.0 | 11.0 | 1.0 | 1.0 | 2.0 | 2.0 | 197.0 |
4 | 3 | 2.0 | 35.0 | 10.0 | 25.0 | 4.0 | 1.0 | 1.0 | 0.0 | 0.0 | 122.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
179 | 0 | 1.0 | 13.0 | 5.0 | 8.0 | 57.0 | 1.0 | 2.0 | 7.0 | 8.0 | 28.0 |
180 | 0 | 2.0 | 26.0 | 2.0 | 24.0 | 65.0 | 2.0 | 2.0 | 2.0 | 3.0 | 122.0 |
181 | 0 | 1.0 | 33.0 | 29.0 | 4.0 | 74.0 | 1.0 | 1.0 | 3.0 | 3.0 | 214.0 |
182 | 0 | 2.0 | 56.0 | 7.0 | 49.0 | 83.0 | 1.0 | 1.0 | 7.0 | 23.0 | 66.0 |
183 | 0 | 1.0 | 21.0 | 4.0 | 17.0 | 82.0 | 1.0 | 1.0 | 4.0 | 4.0 | 54.0 |
184 rows × 11 columns
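Note that data2 mixes scales (binary sex codes next to ictal durations of a few hundred seconds), and k-means uses raw Euclidean distance, so the largest-scale columns dominate the clustering. A hedged sketch of standardizing the features first, kept separate here so the clustering results below stay reproducible as shown:
from sklearn.preprocessing import StandardScaler
# Put every feature on a comparable scale before distance-based clustering
scaled = pd.DataFrame(StandardScaler().fit_transform(data2), columns=data2.columns)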
# Finding the best K value, i.e. the appropriate number of clusters
inertias = []
K = range(1, 10)
plt.figure(figsize=(12, 8))
## finding k with the selected attributes
for k in K:
    # Build and fit the model
    kmeanModel = KMeans(n_clusters=k).fit(data2)
    inertias.append(kmeanModel.inertia_)
plt.plot(K, inertias, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('The Elbow Method using Inertia')
C:\Users\tommy\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:881: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
Text(0.5, 1.0, 'The Elbow Method using Inertia')
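Before settling on the elbow at k = 4 (next cell), an optional cross-check is the silhouette score, which is highest for the k whose clusters are best separated. A minimal sketch on the same data2:
from sklearn.metrics import silhouette_score
# Silhouette needs at least 2 clusters, so start the range at 2
for k in range(2, 10):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(data2)
    print(k, round(silhouette_score(data2, labels), 3))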
# The elbow appears at k = 4
# Fitting K-Means to the dataset
from sklearn.cluster import KMeans
## the number of clusters is set to 4
kmeans = KMeans(n_clusters = 4, init = 'k-means++', random_state = 42)
y_kmeans = kmeans.fit_predict(data2)
y_kmeans1=y_kmeans+1
cluster = pd.DataFrame(y_kmeans1)
# Adding cluster to the Dataset
dataset['cluster'] = cluster
data2['cluster'] = cluster
dataset
  | Nr | Location | Sex | Age | Age onset | Years with ES | Seizure Type | Laterality | Behavior before | Same day ES before | ... | Postictal signs and symptoms--Cloni Arm | Postictal signs and symptoms--Stand up | Level of Consciousness | Coughing Time seconds--Coughing #1 | Coughing Time seconds--Coughing #2 | Coughing Time seconds--Coughing #3 | Coughing Time seconds--Coughing #4 | Disnomia seconds | Aphasia TT | cluster |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | TR | 2.0 | 20.0 | 9.0 | 11.0 | 4.0 | 1.0 | 1.0 | 2.0 | ... | 0 | 0 | 2.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 1 |
1 | 2.0 | TR | 1.0 | 25.0 | 22.0 | 3.0 | 9.0 | 1.0 | 1.0 | 3.0 | ... | 0 | 0 | 5.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 1 |
2 | 3.0 | TR | 2.0 | 28.0 | 6.0 | 22.0 | 10.0 | 1.0 | 2.0 | 0.0 | ... | 0 | 0 | 2.0 | 39 | 0 | 0 | 0 | 0.0 | 0.0 | 1 |
3 | 4.0 | TR | 2.0 | 42.0 | 1.0 | 41.0 | 11.0 | 1.0 | 1.0 | 2.0 | ... | 0 | 0 | 5.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 4 |
4 | 5.0 | TR | 2.0 | 35.0 | 10.0 | 25.0 | 4.0 | 1.0 | 1.0 | 0.0 | ... | 0 | 0 | 2.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
179 | 184.0 | FL | 1.0 | 13.0 | 5.0 | 8.0 | 57.0 | 1.0 | 2.0 | 7.0 | ... | 0 | 0 | 2.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 2 |
180 | 185.0 | FL | 2.0 | 26.0 | 2.0 | 24.0 | 65.0 | 2.0 | 2.0 | 2.0 | ... | 0 | 0 | 2.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 4 |
181 | 186.0 | FL | 1.0 | 33.0 | 29.0 | 4.0 | 74.0 | 1.0 | 1.0 | 3.0 | ... | 0 | 0 | 2.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 4 |
182 | 187.0 | FL | 2.0 | 56.0 | 7.0 | 49.0 | 83.0 | 1.0 | 1.0 | 7.0 | ... | 0 | 0 | 2.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 2 |
183 | 188.0 | FL | 1.0 | 21.0 | 4.0 | 17.0 | 82.0 | 1.0 | 1.0 | 4.0 | ... | 0 | 0 | 11.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 2 |
184 rows × 47 columns
## Print the patient IDs in each cluster
for i in range(1, 5):
    print('****ID of patients in cluster {}*****'.format(str(i)))
    print(list(dataset[dataset.cluster==i]['Nr'].values))
****ID of patients in cluster 1*****
[1.0, 2.0, 3.0, 5.0, 6.0, 7.0, 12.0, 14.0, 19.0, 28.0, 29.0, 30.0, 38.0, 43.0, 49.0, 50.0, 51.0, 52.0, 54.0, 72.0, 73.0, 86.0, 89.0, 90.0, 93.0, 95.0, 96.0, 98.0, 99.0, 109.0, 110.0, 114.0, 115.0, 118.0, 120.0, 123.0, 125.0, 128.0, 130.0, 138.0, 141.0, 143.0, 145.0, 172.0, 174.0]
****ID of patients in cluster 2*****
[48.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 102.0, 147.0, 148.0, 149.0, 150.0, 151.0, 152.0, 153.0, 154.0, 155.0, 156.0, 157.0, 158.0, 159.0, 160.0, 161.0, 162.0, 164.0, 165.0, 166.0, 167.0, 168.0, 169.0, 176.0, 177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 183.0, 184.0, 187.0, 188.0]
****ID of patients in cluster 3*****
[8.0, 9.0, 11.0, 13.0, 18.0, 20.0, 21.0, 24.0, 25.0, 26.0, 27.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 39.0, 40.0, 41.0, 42.0, 44.0, 46.0, 47.0, 53.0, 55.0, 67.0, 68.0, 69.0, 70.0, 71.0, 85.0, 88.0, 91.0, 92.0, 94.0, 100.0, 103.0, 105.0, 106.0, 107.0, 108.0, 111.0, 112.0, 113.0, 116.0, 117.0, 121.0, 122.0, 124.0, 126.0, 127.0, 129.0, 131.0, 132.0, 133.0, 134.0, 135.0, 136.0, 137.0, 139.0, 140.0, 142.0, 170.0, 171.0, 173.0, 175.0]
****ID of patients in cluster 4*****
[4.0, 10.0, 15.0, 16.0, 45.0, 83.0, 84.0, 87.0, 97.0, 101.0, 104.0, 144.0, 146.0, 163.0, 185.0, 186.0]
print(data2['cluster'].value_counts())
3    68
2    55
1    45
4    16
Name: cluster, dtype: int64
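To make the four k-means groups easier to interpret than raw membership lists, a short hedged sketch of per-cluster feature means on the same data2:
# Average feature values per k-means cluster, for a quick profile of each group
cluster_profile = data2.groupby('cluster').mean().round(1)
print(cluster_profile)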
# Train a classifier that separates one cluster from the rest, to see which features drive it
from sklearn.ensemble import RandomForestClassifier
def forestmodel(df):
    clf = RandomForestClassifier(random_state=1)
    clf.fit(df.drop(columns=["Binary Cluster 0", "cluster"]).values, df["Binary Cluster 0"].values)
    # Index sort the most important features
    sorted_feature_weight_idxes = np.argsort(clf.feature_importances_)[::-1]  # Reverse sort
    # Get the most important feature names and weights
    most_important_features = np.take_along_axis(
        np.array(df.columns.tolist()),
        sorted_feature_weight_idxes, axis=0)
    most_important_weights = np.take_along_axis(
        np.array(clf.feature_importances_),
        sorted_feature_weight_idxes, axis=0)
    # Show
    return list(zip(most_important_features, most_important_weights))
for i in range(1, 5):
    data2['Binary Cluster 0'] = np.where(data2['cluster'] == i, 1, 0)
    print(f'## the feature importances for cluster {i}\n')
    feat_imp = forestmodel(data2)
    print(feat_imp)
## the feature importances for cluster 1
[('Ictal Seconds', 0.404402195843408), ('Seizure Type', 0.1610039480546899), ('Years with ES', 0.1276797177677198), ('Age onset', 0.08778215739327966), ('Age', 0.0777674326261514), ('ES before', 0.04128598658738516), ('Same day ES before', 0.02772220409778277), ('Sex', 0.02551892861854519), ('Location', 0.018516281311191766), ('Behavior before', 0.014473893632501296), ('Laterality', 0.013847254067345167)]
## the feature importances for cluster 2
[('Seizure Type', 0.6493457211279573), ('Ictal Seconds', 0.09709878225974283), ('Age', 0.05026043784401173), ('Age onset', 0.04539125339151699), ('Years with ES', 0.0427693759958361), ('ES before', 0.036646011785206344), ('Location', 0.032323646121786974), ('Same day ES before', 0.019077410308894236), ('Sex', 0.01129771735738162), ('Laterality', 0.00849939737145132), ('Behavior before', 0.007290246436214663)]
## the feature importances for cluster 3
[('Seizure Type', 0.352666356263936), ('Ictal Seconds', 0.34537009442344474), ('Years with ES', 0.06980027733210667), ('Age onset', 0.054839934256800026), ('ES before', 0.05242327124633987), ('Age', 0.0490743244766378), ('Location', 0.022922807340597446), ('Same day ES before', 0.021918581327734516), ('Sex', 0.012786904801336402), ('Laterality', 0.011272816830795599), ('Behavior before', 0.0069246317002709425)]
## the feature importances for cluster 4
[('Ictal Seconds', 0.631586373222727), ('Age', 0.07857557055109868), ('Years with ES', 0.06600087497427046), ('Age onset', 0.05385883838384985), ('Seizure Type', 0.05169409500700529), ('ES before', 0.04437213030496303), ('Location', 0.02152791471214902), ('Sex', 0.0197223252255728), ('Same day ES before', 0.016975537050294386), ('Laterality', 0.008952620771478317), ('Behavior before', 0.006733719796591151)]
Agglomerative hierarchical clustering differs from k-means in a key way. Rather than choosing the number of clusters up front and starting from random centroids, we begin with every point in the dataset as its own cluster. We then merge the two closest points into a cluster, merge the next-closest pair, and keep repeating until everything has been joined into a single cluster.
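As a small illustration of that merge sequence (the four 1-D points here are made up for the example), the linkage matrix returned by scipy lists one merge per row: the two cluster indices joined, their distance, and the size of the new cluster.
import numpy as np
import scipy.cluster.hierarchy as sch
# Four toy points; the two closest (0.0 and 0.5) are merged first, and so on
points = np.array([[0.0], [0.5], [3.0], [10.0]])
print(sch.linkage(points, method='ward'))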
# create dendrogram
plt.figure(figsize=(15,8))
dendrogram = sch.dendrogram(sch.linkage(data2, method='ward'))
Now that we know the number of clusters for our dataset, the next step is to group the data points into these four clusters. To do so we use the AgglomerativeClustering class from the sklearn.cluster library; take a look at the following script.
from sklearn.cluster import AgglomerativeClustering
cluster = AgglomerativeClustering(n_clusters=4, affinity='euclidean', linkage='ward')
cluster.fit_predict(data2)
array([0, 0, 0, 1, 0, 0, 0, 3, 3, 1, 3, 0, 3, 0, 1, 1, 3, 0, 3, 3, 3, 0, 3, 3, 0, 0, 0, 3, 3, 3, 0, 3, 0, 3, 0, 3, 3, 3, 0, 0, 3, 1, 3, 3, 2, 2, 2, 0, 0, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 3, 0, 1, 3, 0, 0, 3, 3, 0, 0, 0, 0, 1, 0, 2, 3, 1, 2, 0, 1, 3, 0, 3, 3, 0, 3, 3, 3, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 3, 3, 0, 0, 0, 3, 0, 3, 0, 3, 0, 3, 3, 3, 0, 0, 3, 0, 3, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 0, 3, 0, 3, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2], dtype=int64)
dataset['cluster'] = cluster.labels_ + 1
## Print the patient IDs in each cluster obtained by hierarchical clustering
for i in range(1, 5):
    print('****ID of patients in cluster {}*****'.format(str(i)))
    print(list(dataset[dataset.cluster==i]['Nr'].values))
****ID of patients in cluster 1*****
[1.0, 2.0, 3.0, 5.0, 6.0, 7.0, 12.0, 14.0, 19.0, 25.0, 28.0, 29.0, 30.0, 34.0, 36.0, 38.0, 42.0, 43.0, 51.0, 52.0, 53.0, 55.0, 72.0, 73.0, 86.0, 89.0, 90.0, 93.0, 94.0, 95.0, 96.0, 98.0, 103.0, 106.0, 109.0, 114.0, 115.0, 116.0, 118.0, 120.0, 121.0, 123.0, 126.0, 127.0, 128.0, 130.0, 132.0, 134.0, 138.0, 139.0, 141.0, 172.0, 174.0, 176.0]
****ID of patients in cluster 2*****
[4.0, 10.0, 15.0, 16.0, 45.0, 84.0, 87.0, 97.0, 101.0, 104.0, 144.0, 146.0, 186.0]
****ID of patients in cluster 3*****
[48.0, 49.0, 50.0, 54.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 99.0, 102.0, 143.0, 145.0, 147.0, 148.0, 149.0, 150.0, 151.0, 152.0, 153.0, 154.0, 155.0, 156.0, 157.0, 158.0, 159.0, 160.0, 161.0, 162.0, 163.0, 164.0, 165.0, 166.0, 167.0, 168.0, 169.0, 177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 183.0, 184.0, 185.0, 187.0, 188.0]
****ID of patients in cluster 4*****
[8.0, 9.0, 11.0, 13.0, 18.0, 20.0, 21.0, 24.0, 26.0, 27.0, 31.0, 32.0, 33.0, 35.0, 37.0, 39.0, 40.0, 41.0, 44.0, 46.0, 47.0, 67.0, 68.0, 69.0, 70.0, 71.0, 85.0, 88.0, 91.0, 92.0, 100.0, 105.0, 107.0, 108.0, 110.0, 111.0, 112.0, 113.0, 117.0, 122.0, 124.0, 125.0, 129.0, 131.0, 133.0, 135.0, 136.0, 137.0, 140.0, 142.0, 170.0, 171.0, 173.0, 175.0]
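The two partitions look broadly similar (hierarchical clusters 2 and 3 roughly correspond to k-means clusters 4 and 2). A hedged sketch of quantifying the agreement, assuming y_kmeans and cluster.labels_ from the cells above are still in scope:
from sklearn.metrics import adjusted_rand_score
# 1.0 means identical partitions (up to label renaming), 0 means chance-level agreement
print(adjusted_rand_score(y_kmeans, cluster.labels_))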
from sklearn.feature_selection import SelectKBest, chi2, f_regression
## From the dataset, select the 10 features that best predict consciousness time
df=dataset
encoder=LabelEncoder()
df['Location']=encoder.fit_transform(df['Location'])
X=df.drop(columns=['Nr','Consciousness Time'])
y=df['Consciousness Time']
# Create the object for SelectKBest and fit and transform the regression data
X_reg_new=SelectKBest(score_func=f_regression, k=10).fit_transform(X,y)
## The best 10 features to predict consciousness time
X_reg_new = pd.DataFrame(X_reg_new)
# Recover the selected feature names by matching columns back to the original dataset
for col in X_reg_new.columns:
    for col1 in dataset.columns:
        if all(dataset[col1] == X_reg_new[col]):
            print(col1)
Years with ES
Same day ES before
ES before
Postictal signs and symptoms--OA
Postictal signs and symptoms--NRR
Postictal signs and symptoms--Smacking
Postictal signs and symptoms--Hipcup
Level of Consciousness
Disnomia seconds
Aphasia TT
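Matching columns by value works here, but it can mis-report if two columns happen to contain identical values. A more direct, hedged alternative is to keep the fitted selector and read the chosen column names from its boolean mask:
# Fit the selector separately so get_support() can map back to the original column names
selector = SelectKBest(score_func=f_regression, k=10).fit(X, y)
print(X.columns[selector.get_support()].tolist())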
# First, check which combinations of common attributes (age, sex, etc.) occur most often within each cluster
df1=dataset[['cluster','Location', 'Sex', 'Age', 'Age onset', 'Years with ES',
'Seizure Type', ]]
df1['group_incidence'] = df1.groupby(['cluster','Location', 'Sex', 'Age', 'Age onset', 'Years with ES'])['cluster'].transform('size') / len(df)
C:\Users\tommy\AppData\Local\Temp/ipykernel_3456/3965849163.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# We can see the most common combinations by occurrence, sorted in descending order
## This shows the most frequently repeated groups of age, sex and other attributes among patients
df1.sort_values('group_incidence',ascending=False).head(10)
  | cluster | Location | Sex | Age | Age onset | Years with ES | Seizure Type | group_incidence |
---|---|---|---|---|---|---|---|---|
105 | 1 | 2 | 2.0 | 68.0 | 9.0 | 59.0 | 1.0 | 0.016304 |
163 | 3 | 2 | 1.0 | 47.0 | 35.0 | 12.0 | 82.0 | 0.016304 |
146 | 3 | 2 | 1.0 | 47.0 | 35.0 | 12.0 | 40.0 | 0.016304 |
157 | 3 | 2 | 1.0 | 47.0 | 35.0 | 12.0 | 74.0 | 0.016304 |
179 | 3 | 0 | 1.0 | 13.0 | 5.0 | 8.0 | 57.0 | 0.016304 |
85 | 1 | 2 | 2.0 | 68.0 | 9.0 | 59.0 | 2.0 | 0.016304 |
125 | 1 | 2 | 2.0 | 68.0 | 9.0 | 59.0 | 1.0 | 0.016304 |
177 | 3 | 0 | 1.0 | 13.0 | 5.0 | 8.0 | 49.0 | 0.016304 |
176 | 3 | 0 | 1.0 | 13.0 | 5.0 | 8.0 | 48.0 | 0.016304 |
128 | 4 | 2 | 1.0 | 51.0 | 1.0 | 50.0 | 23.0 | 0.010870 |
df1.sort_values('group_incidence',ascending=False).tail(10)
  | cluster | Location | Sex | Age | Age onset | Years with ES | Seizure Type | group_incidence |
---|---|---|---|---|---|---|---|---|
62 | 3 | 3 | 1.0 | 51.0 | 35.0 | 16.0 | 82.0 | 0.005435 |
70 | 3 | 1 | 1.0 | 29.0 | 15.0 | 14.0 | 41.0 | 0.005435 |
63 | 4 | 1 | 1.0 | 46.0 | 6.0 | 40.0 | 1.0 | 0.005435 |
64 | 4 | 1 | 1.0 | 51.0 | 1.0 | 50.0 | 1.0 | 0.005435 |
65 | 4 | 1 | 1.0 | 52.0 | 14.0 | 38.0 | 22.0 | 0.005435 |
66 | 4 | 1 | 1.0 | 19.0 | 10.0 | 9.0 | 25.0 | 0.005435 |
67 | 4 | 1 | 1.0 | 28.0 | 9.0 | 19.0 | 26.0 | 0.005435 |
68 | 1 | 1 | 2.0 | 44.0 | 38.0 | 6.0 | 26.0 | 0.005435 |
69 | 1 | 1 | 1.0 | 16.0 | 2.0 | 14.0 | 37.0 | 0.005435 |
183 | 3 | 0 | 1.0 | 21.0 | 4.0 | 17.0 | 82.0 | 0.005435 |