import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
dataframe = pd.read_csv(r"https://www.juanbarrios.com/descargas/cardiopatia_generated.csv")
X = np.array(dataframe.drop(['cardiopatia'], axis=1))
y = np.array(dataframe['cardiopatia'])
# the synthetic data were generated using the bootstrap method
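For context, the bootstrap method mentioned above amounts to resampling rows with replacement. A minimal sketch, assuming a hypothetical `original_df` DataFrame (the notebook does not show the actual generation step):

# Sketch only: bootstrap = draw rows with replacement until the desired size.
# 'original_df' is a hypothetical placeholder; the real source data is not shown.
def bootstrap_sample(original_df, n_rows, seed=0):
    return original_df.sample(n=n_rows, replace=True, random_state=seed).reset_index(drop=True)

# synthetic = bootstrap_sample(original_df, n_rows=2000)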
# List the first 15 rows (and, below, the last 5)
dataframe.head(15)
| | cardiopatia | Sexe | Tabac | HTA | Diabetis | obesitat | edat_agrup |
|---|---|---|---|---|---|---|---|
| 0 | 2 | 2 | 2 | 1 | 2 | 2 | 3 |
| 1 | 2 | 1 | 1 | 1 | 2 | | 4 |
| 2 | 1 | 1 | 2 | 1 | 2 | 1 | 4 |
| 3 | 1 | 2 | 2 | 2 | 2 | 2 | 2 |
| 4 | 2 | 2 | 3 | 2 | 2 | 2 | 3 |
| 5 | 1 | 2 | 3 | 2 | 2 | 2 | 3 |
| 6 | 2 | 1 | 2 | 1 | 2 | 0 | 4 |
| 7 | 2 | 1 | 2 | 1 | 2 | 1 | 4 |
| 8 | 2 | 2 | 0 | 1 | 1 | 2 | 4 |
| 9 | 1 | 2 | 3 | 2 | 2 | 2 | 3 |
| 10 | 2 | 1 | 1 | 1 | 2 | 1 | 3 |
| 11 | 2 | 2 | 1 | 1 | 1 | 1 | 3 |
| 12 | 2 | 1 | 1 | 2 | 2 | 2 | 3 |
| 13 | 1 | 2 | 2 | 2 | 2 | 2 | 3 |
| 14 | 2 | 1 | 3 | 2 | 2 | 2 | 3 |
dataframe.tail(5)
| | cardiopatia | Sexe | Tabac | HTA | Diabetis | obesitat | edat_agrup |
|---|---|---|---|---|---|---|---|
| 1995 | 2 | 1 | 1 | 1 | 2 | | 4 |
| 1996 | 1 | 1 | 3 | 1 | 2 | 2 | 4 |
| 1997 | 2 | 2 | 3 | 2 | 2 | 2 | 3 |
| 1998 | 1 | 2 | 3 | 2 | 2 | 1 | 3 |
| 1999 | 1 | 2 | 1 | 2 | 2 | 2 | 2 |
dataframe.shape
(2000, 7)
dataframe.dtypes
cardiopatia     int64
Sexe            int64
Tabac          object
HTA            object
Diabetis       object
obesitat       object
edat_agrup      int64
dtype: object
# Replace whitespace-only values with 0, then cast the object columns to integers
dataframe = dataframe.replace(r'\s+', 0, regex=True)
dataframe['Tabac'] = dataframe['Tabac'].astype(int)
dataframe['HTA'] = dataframe['HTA'].astype(int)
dataframe['Diabetis'] = dataframe['Diabetis'].astype(int)
dataframe['obesitat'] = dataframe['obesitat'].astype(int)
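An equivalent, slightly more defensive version of this cleaning step is sketched below; pd.to_numeric with errors='coerce' turns anything non-numeric (not only whitespace) into NaN before the fill:

# Sketch: coerce non-numeric values to NaN, then fill with 0 and cast to int
for col in ['Tabac', 'HTA', 'Diabetis', 'obesitat']:
    dataframe[col] = pd.to_numeric(dataframe[col], errors='coerce').fillna(0).astype(int)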
dataframe.head(20)
| | cardiopatia | Sexe | Tabac | HTA | Diabetis | obesitat | edat_agrup |
|---|---|---|---|---|---|---|---|
| 0 | 2 | 2 | 2 | 1 | 2 | 2 | 3 |
| 1 | 2 | 1 | 1 | 1 | 2 | 0 | 4 |
| 2 | 1 | 1 | 2 | 1 | 2 | 1 | 4 |
| 3 | 1 | 2 | 2 | 2 | 2 | 2 | 2 |
| 4 | 2 | 2 | 3 | 2 | 2 | 2 | 3 |
| 5 | 1 | 2 | 3 | 2 | 2 | 2 | 3 |
| 6 | 2 | 1 | 2 | 1 | 2 | 0 | 4 |
| 7 | 2 | 1 | 2 | 1 | 2 | 1 | 4 |
| 8 | 2 | 2 | 0 | 1 | 1 | 2 | 4 |
| 9 | 1 | 2 | 3 | 2 | 2 | 2 | 3 |
| 10 | 2 | 1 | 1 | 1 | 2 | 1 | 3 |
| 11 | 2 | 2 | 1 | 1 | 1 | 1 | 3 |
| 12 | 2 | 1 | 1 | 2 | 2 | 2 | 3 |
| 13 | 1 | 2 | 2 | 2 | 2 | 2 | 3 |
| 14 | 2 | 1 | 3 | 2 | 2 | 2 | 3 |
| 15 | 1 | 1 | 2 | 1 | 2 | 2 | 4 |
| 16 | 1 | 1 | 1 | 2 | 2 | 2 | 2 |
| 17 | 1 | 2 | 3 | 2 | 2 | 2 | 3 |
| 18 | 2 | 1 | 2 | 1 | 2 | 1 | 4 |
| 19 | 2 | 2 | 0 | 1 | 2 | 2 | 4 |
# Split the data into training and test sets
# This defines the 4 datasets: train and test splits for both X and y
X = dataframe.drop(['cardiopatia'], axis=1)
y = dataframe['cardiopatia']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # 80% for training, 20% for testing
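As a side note, train_test_split also accepts a stratify argument that preserves the class proportions in both partitions; a sketch (the Xs_*/ys_* names are ours and are not used below):

# Sketch: a stratified split keeps the class ratio in both train and test
Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)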
print(dataframe.groupby('cardiopatia').size())
cardiopatia
1     735
2    1265
dtype: int64
# Build a dictionary with the number of samples per class
unique_classes, class_counts = np.unique(y, return_counts=True)
class_weight_best = {class_id: count for class_id, count in zip(unique_classes, class_counts)}
print(class_weight_best)
{1: 735, 2: 1265}
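Note that this dictionary maps each class to its raw count, so using it as class_weight gives the majority class the larger weight. For comparison, a sketch of scikit-learn's 'balanced' formula, n_samples / (n_classes * count), which does the opposite (balanced_weights is our name and is not used below):

# Sketch: 'balanced'-style weights up-weight the minority class instead
n_samples = len(y)
n_classes = len(unique_classes)
balanced_weights = {c: n_samples / (n_classes * n)
                    for c, n in zip(unique_classes, class_counts)}
print(balanced_weights)  # {1: 1.36..., 2: 0.79...} for counts 735 and 1265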
dataframe.hist()
plt.show()
#sb.pairplot(dataframe.dropna(), hue='cardiopatia',height=4,vars=["Sexe", "edat_agrup","Tabac","HTA","Diabetis","obesitat"],kind='reg')
dataframe.dtypes
cardiopatia    int64
Sexe           int64
Tabac          int32
HTA            int32
Diabetis       int32
obesitat       int32
edat_agrup     int64
dtype: object
logistic_model = LogisticRegression(class_weight=class_weight_best)
logistic_model.fit(X_train, y_train)
y_pred_train = logistic_model.predict(X_train)
y_pred_test = logistic_model.predict(X_test)
accuracy_train_logistic = accuracy_score(y_train, y_pred_train)
accuracy_test_logistic = accuracy_score(y_test, y_pred_test)
print("Modelo de regresión logística set de entrenamiento:", accuracy_train_logistic)
print("modelo de religión logística set de test", accuracy_test_logistic)
Modelo de regresión logística set de entrenamiento: 0.634375 modelo de religión logística set de test 0.6225
# Confusion matrix
y_pred = logistic_model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
[[ 24 126]
 [ 25 225]]
matriz_confusion = confusion_matrix(y_test, y_pred)
def imprimir_matriz_confusion(matriz):
    # Row 0 corresponds to class 1 (negative), row 1 to class 2 (positive)
    VN, FP = matriz[0]
    FN, VP = matriz[1]
    error_tipo_1 = FP / (FP + VN)  # Type I error = false positive rate
    error_tipo_2 = FN / (FN + VP)  # Type II error = false negative rate
    print(f"True Negatives (VN): {VN}")
    print(f"False Positives (FP): {FP}")
    print(f"False Negatives (FN): {FN}")
    print(f"True Positives (VP): {VP}\n")
    print(f"Type I error: {error_tipo_1:.2f}")
    print(f"Type II error: {error_tipo_2:.2f}")
imprimir_matriz_confusion(matriz_confusion)
True Negatives (VN): 24
False Positives (FP): 126
False Negatives (FN): 25
True Positives (VP): 225

Type I error: 0.84
Type II error: 0.10
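The same four cells can also be unpacked in one line; a quick cross-check with numpy's ravel(), which flattens the 2x2 matrix row by row:

# Cross-check: ravel() yields (VN, FP, FN, VP) for a binary confusion matrix
VN, FP, FN, VP = confusion_matrix(y_test, y_pred).ravel()
print(VN, FP, FN, VP)  # 24 126 25 225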
# Classification report
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           1       0.49      0.16      0.24       150
           2       0.64      0.90      0.75       250

    accuracy                           0.62       400
   macro avg       0.57      0.53      0.49       400
weighted avg       0.58      0.62      0.56       400
# Compute the accuracy using model.score
score = logistic_model.score(X_test, y_test)
print(f"Accuracy using model.score: {score}")
# Compute the accuracy using accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using accuracy_score: {accuracy}")
Accuracy using model.score: 0.6225
Accuracy using accuracy_score: 0.6225
Although both methods, accuracy_score and model.score, compute the same metric (the model's accuracy), the main difference is how they are used. While accuracy_score takes the true labels and the model's predictions as input, model.score takes the features and the true labels and computes the predictions internally before calculating the accuracy.
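In other words, for a classifier the two calls are interchangeable; a sketch using the objects defined above:

# model.score predicts internally and then applies the same accuracy metric
assert logistic_model.score(X_test, y_test) == accuracy_score(
    y_test, logistic_model.predict(X_test))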
# other libraries, specific to decision trees
plt.rcParams['figure.figsize'] = (16, 16)
plt.style.use('ggplot')
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont
## apply cross-validation with a grid search
from sklearn.model_selection import GridSearchCV
# Define the value ranges for min_samples_split and min_samples_leaf
min_samples_split_range = range(2, 50, 5)
min_samples_leaf_range = range(2, 50, 5)
# Define the base model and the parameters for the grid search
decision_tree = DecisionTreeClassifier(criterion='entropy', max_depth=9, class_weight=class_weight_best)
param_grid = {'min_samples_split': min_samples_split_range, 'min_samples_leaf': min_samples_leaf_range}
# Run the grid search with cross-validation
grid_search = GridSearchCV(decision_tree, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X, y)
# Get the best values for min_samples_split and min_samples_leaf
min_samples_split_best = grid_search.best_params_['min_samples_split']
min_samples_leaf_best = grid_search.best_params_['min_samples_leaf']
# Use the optimal values in the DecisionTreeClassifier model
model = DecisionTreeClassifier(criterion='entropy', max_depth=9, class_weight=class_weight_best,
min_samples_split=min_samples_split_best, min_samples_leaf=min_samples_leaf_best)
print(f"Min Samples Split: {min_samples_split_best}")
print(f"Min Samples Leaf: {min_samples_leaf_best}")
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Min Samples Split: 2
Min Samples Leaf: 2
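GridSearchCV also keeps the cross-validated score of the winning combination and a copy of the model refit on all of X, y; a short sketch of its standard attributes (best_tree is our name):

# Standard GridSearchCV attributes after fit()
print(f"Best CV accuracy: {grid_search.best_score_:.4f}")
best_tree = grid_search.best_estimator_  # refit on the full X, y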
entradas = dataframe.drop(['cardiopatia'], axis=1)
# generate the tree; note that X_train and y_train are redefined here over the full dataset
y_train = dataframe['cardiopatia']
X_train = dataframe.drop(['cardiopatia'], axis=1).values
# Create the decision tree with a limited depth
decision_tree = tree.DecisionTreeClassifier(criterion='entropy',
                                            min_samples_split=5,
                                            min_samples_leaf=2,
                                            max_depth=9,
                                            class_weight={1: 1.72})  # ~1265/735, balances the classes
decision_tree.fit(X_train, y_train)
# Export the model to a .dot file
with open(r"tree1.dot", 'w') as f:
    tree.export_graphviz(decision_tree,
                         out_file=f,
                         max_depth=9,
                         impurity=True,
                         feature_names=list(dataframe.drop(['cardiopatia'], axis=1)),
                         class_names=['Sano', 'Cardiopatía'],
                         rounded=True,
                         filled=True)
# Convert the .dot file to PNG for visualization
check_call(['dot','-Tpng',r'tree1.dot','-o',r'tree1.png'])
PImage("tree1.png")
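If the Graphviz dot binary is not installed, scikit-learn's own plot_tree offers a matplotlib-only alternative; a sketch (the depth is truncated to 3 here purely for legibility, an assumption of ours):

# Sketch: render the fitted tree without Graphviz, via sklearn.tree.plot_tree
fig, ax = plt.subplots(figsize=(16, 16))
tree.plot_tree(decision_tree,
               feature_names=list(dataframe.drop(['cardiopatia'], axis=1)),
               class_names=['Sano', 'Cardiopatía'],
               filled=True, rounded=True, max_depth=3, ax=ax)
plt.show()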