import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
dataframe = pd.read_csv(r"https://www.juanbarrios.com/descargas/cardiopatia_generated.csv")
X = np.array(dataframe.drop(['cardiopatia'],1))
y = np.array(dataframe['cardiopatia'])
# para hacer los datos sintéticos se utilizó el método bootstrap
C:\Users\tommy\AppData\Local\Temp\ipykernel_55596\3301660309.py:2: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only. X = np.array(dataframe.drop(['cardiopatia'],1))
# Se listan los primeros 5 filas o las ultimas cinco
dataframe.head(15)
| cardiopatia | Sexe | Tabac | HTA | Diabetis | obesitat | edat_agrup | |
|---|---|---|---|---|---|---|---|
| 0 | 2 | 2 | 2 | 1 | 2 | 2 | 3 |
| 1 | 2 | 1 | 1 | 1 | 2 | 4 | |
| 2 | 1 | 1 | 2 | 1 | 2 | 1 | 4 |
| 3 | 1 | 2 | 2 | 2 | 2 | 2 | 2 |
| 4 | 2 | 2 | 3 | 2 | 2 | 2 | 3 |
| 5 | 1 | 2 | 3 | 2 | 2 | 2 | 3 |
| 6 | 2 | 1 | 2 | 1 | 2 | 0 | 4 |
| 7 | 2 | 1 | 2 | 1 | 2 | 1 | 4 |
| 8 | 2 | 2 | 0 | 1 | 1 | 2 | 4 |
| 9 | 1 | 2 | 3 | 2 | 2 | 2 | 3 |
| 10 | 2 | 1 | 1 | 1 | 2 | 1 | 3 |
| 11 | 2 | 2 | 1 | 1 | 1 | 1 | 3 |
| 12 | 2 | 1 | 1 | 2 | 2 | 2 | 3 |
| 13 | 1 | 2 | 2 | 2 | 2 | 2 | 3 |
| 14 | 2 | 1 | 3 | 2 | 2 | 2 | 3 |
dataframe.tail( 5)
| cardiopatia | Sexe | Tabac | HTA | Diabetis | obesitat | edat_agrup | |
|---|---|---|---|---|---|---|---|
| 1995 | 2 | 1 | 1 | 1 | 2 | 4 | |
| 1996 | 1 | 1 | 3 | 1 | 2 | 2 | 4 |
| 1997 | 2 | 2 | 3 | 2 | 2 | 2 | 3 |
| 1998 | 1 | 2 | 3 | 2 | 2 | 1 | 3 |
| 1999 | 1 | 2 | 1 | 2 | 2 | 2 | 2 |
dataframe.shape
(2000, 7)
dataframe.dtypes
cardiopatia int64 Sexe int64 Tabac object HTA object Diabetis object obesitat object edat_agrup int64 dtype: object
dataframe=dataframe.replace(r'\s+', 0, regex=True)
dataframe['Tabac']=dataframe['Tabac'].astype(int)
dataframe['HTA']=dataframe['HTA'].astype(int)
dataframe['Diabetis']=dataframe['Diabetis'].astype(int)
dataframe['obesitat']=dataframe['obesitat'].astype(int)
dataframe.head(20)
| cardiopatia | Sexe | Tabac | HTA | Diabetis | obesitat | edat_agrup | |
|---|---|---|---|---|---|---|---|
| 0 | 2 | 2 | 2 | 1 | 2 | 2 | 3 |
| 1 | 2 | 1 | 1 | 1 | 2 | 0 | 4 |
| 2 | 1 | 1 | 2 | 1 | 2 | 1 | 4 |
| 3 | 1 | 2 | 2 | 2 | 2 | 2 | 2 |
| 4 | 2 | 2 | 3 | 2 | 2 | 2 | 3 |
| 5 | 1 | 2 | 3 | 2 | 2 | 2 | 3 |
| 6 | 2 | 1 | 2 | 1 | 2 | 0 | 4 |
| 7 | 2 | 1 | 2 | 1 | 2 | 1 | 4 |
| 8 | 2 | 2 | 0 | 1 | 1 | 2 | 4 |
| 9 | 1 | 2 | 3 | 2 | 2 | 2 | 3 |
| 10 | 2 | 1 | 1 | 1 | 2 | 1 | 3 |
| 11 | 2 | 2 | 1 | 1 | 1 | 1 | 3 |
| 12 | 2 | 1 | 1 | 2 | 2 | 2 | 3 |
| 13 | 1 | 2 | 2 | 2 | 2 | 2 | 3 |
| 14 | 2 | 1 | 3 | 2 | 2 | 2 | 3 |
| 15 | 1 | 1 | 2 | 1 | 2 | 2 | 4 |
| 16 | 1 | 1 | 1 | 2 | 2 | 2 | 2 |
| 17 | 1 | 2 | 3 | 2 | 2 | 2 | 3 |
| 18 | 2 | 1 | 2 | 1 | 2 | 1 | 4 |
| 19 | 2 | 2 | 0 | 1 | 2 | 2 | 4 |
# Dividir los datos en conjuntos de entrenamiento y prueba
# Aca se definen los 4 grupos de datos: test y train tanto para X como para y
X = dataframe.drop(['cardiopatia'], axis=1)
y = dataframe['cardiopatia']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # 80% for training, 20% for testing
print(dataframe.groupby('cardiopatia').size())
cardiopatia 1 735 2 1265 dtype: int64
# Acá se le asigna a las clases un diccionario especifico
unique_classes, class_counts = np.unique(y, return_counts=True)
class_weight_best = {class_id: count for class_id, count in zip(unique_classes, class_counts)}
print(class_weight_best)
{1: 735, 2: 1265}
dataframe.hist()
plt.show()
#sb.pairplot(dataframe.dropna(), hue='cardiopatia',height=4,vars=["Sexe", "edat_agrup","Tabac","HTA","Diabetis","obesitat"],kind='reg')
dataframe.dtypes
cardiopatia int64 Sexe int64 Tabac int32 HTA int32 Diabetis int32 obesitat int32 edat_agrup int64 dtype: object
logistic_model = LogisticRegression(class_weight=class_weight_best)
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)
y_pred_train = logistic_model.predict(X_train)
y_pred_test = logistic_model.predict(X_test)
accuracy_train_logistic = accuracy_score(y_train, y_pred_train)
accuracy_test_logistic = accuracy_score(y_test, y_pred_test)
print("Modelo de regresión logística set de entrenamiento:", accuracy_train_logistic)
print("modelo de religión logística set de test", accuracy_test_logistic)
Modelo de regresión logística set de entrenamiento: 0.634375 modelo de religión logística set de test 0.6225
# Matriz de confusion
y_pred = logistic_model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
[[ 24 126] [ 25 225]]
matriz_confusion = confusion_matrix(y_test, y_pred)
def imprimir_matriz_confusion(matriz):
VN, FP = matriz[0]
FN, VP = matriz[1]
error_tipo_1 = FP / (FP + VN)
error_tipo_2 = FN / (FN + VP)
print(f"Verdaderos Negativos (VN): {VN}")
print(f"Falsos Positivos (FP): {FP}")
print(f"Falsos Negativos (FN): {FN}")
print(f"Verdaderos Positivos (VP): {VP}\n")
print(f"Error Tipo 1: {error_tipo_1:.2f}")
print(f"Error Tipo 2: {error_tipo_2:.2f}")
imprimir_matriz_confusion(matriz_confusion)
Verdaderos Negativos (VN): 24 Falsos Positivos (FP): 126 Falsos Negativos (FN): 25 Verdaderos Positivos (VP): 225 Error Tipo 1: 0.84 Error Tipo 2: 0.10
# Reporte de clasificación
print(classification_report(y_test, y_pred))
precision recall f1-score support
1 0.49 0.16 0.24 150
2 0.64 0.90 0.75 250
accuracy 0.62 400
macro avg 0.57 0.53 0.49 400
weighted avg 0.58 0.62 0.56 400
# Calcular la precisión usando model.score
score = logistic_model.score(X_test, y_test)
print(f"Precisión usando model.score: {score}")
# Calcular la precisión usando accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Precisión usando accuracy_score: {accuracy}")
Precisión usando model.score: 0.6225 Precisión usando accuracy_score: 0.6225
Aunque ambos metodos: accuracy_score y model.score calculan la misma métrica (precisión del modelo), la principal diferencia es cómo se usan. Mientras que accuracy_score toma como entrada las etiquetas verdaderas y las predicciones hechas por el modelo, model.score toma las características y las etiquetas verdaderas y realiza las predicciones internamente antes de calcular la precisión.
# otras librerias especificas para los arboles
plt.rcParams['figure.figsize'] = (16, 16)
plt.style.use('ggplot')
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont
## aplicamos el modelo de validación cruzada con grid search
from sklearn.model_selection import GridSearchCV
decision_tree = tree.DecisionTreeClassifier
# Definir los rangos de valores para min_samples_split y min_samples_leaf
min_samples_split_range = range(2, 50, 5)
min_samples_leaf_range = range(2, 50, 5)
# Definir el modelo base y los parámetros para la búsqueda en cuadrícula
decision_tree = DecisionTreeClassifier(criterion='entropy', max_depth=9, class_weight=class_weight_best)
param_grid = {'min_samples_split': min_samples_split_range, 'min_samples_leaf': min_samples_leaf_range}
# Realizar la búsqueda en cuadrícula con validación cruzada
grid_search = GridSearchCV(decision_tree, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X, y)
# Obtener los mejores valores para min_samples_split y min_samples_leaf
min_samples_split_best = grid_search.best_params_['min_samples_split']
min_samples_leaf_best = grid_search.best_params_['min_samples_leaf']
# Utilizar los valores óptimos en el modelo DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='entropy', max_depth=9, class_weight=class_weight_best,
min_samples_split=min_samples_split_best, min_samples_leaf=min_samples_leaf_best)
print(f"Min Samples Split: {min_samples_split_best}")
print(f"Min Samples Leaf: {min_samples_leaf_best}")
Fitting 5 folds for each of 100 candidates, totalling 500 fits Min Samples Split: 2 Min Samples Leaf: 2
entradas = dataframe.drop(['cardiopatia'], axis=1)
# generemos el arbol con parametros por defecto
y_train = dataframe['cardiopatia']
X_train = dataframe.drop(['cardiopatia'], axis=1).values
# Crear Arbol de decision con profundidad
decision_tree = tree.DecisionTreeClassifier(criterion='entropy',
min_samples_split=5,
min_samples_leaf=2,
max_depth = 9,
class_weight={1:1.72})
decision_tree.fit(X_train, y_train,)
# exportar el modelo a archivo .dot
with open(r"tree1.dot", 'w') as f:
f = tree.export_graphviz(decision_tree,
out_file=f,
max_depth = 9,
impurity = True,
feature_names = list(dataframe.drop(['cardiopatia'], axis=1)),
class_names = ['Sano', 'Cardiopatía'],
rounded = True,
filled= True )
# Convertir el archivo .dot a png para poder visualizarlo
check_call(['dot','-Tpng',r'tree1.dot','-o',r'tree1.png'])
PImage("tree1.png")
#Precision global del modelo (Accuracy)
y_pred = decision_tree.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
Accuracy: 0.915
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py:443: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names warnings.warn(
#Precision del modelo
decision_tree.score(X,y)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py:443: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names warnings.warn(
0.9145
Aunque ambos metodos: accuracy_score y model.score calculan la misma métrica (precisión del modelo), la principal diferencia es cómo se usan. Mientras que accuracy_score toma como entrada las etiquetas verdaderas y las predicciones hechas por el modelo, model.score toma las características y las etiquetas verdaderas y realiza las predicciones internamente antes de calcular la precisión.
# Validación cruzada
cv_scores = cross_val_score(model, X_train, y_train, cv=5)
# Imprimir la precisión de cada fold
for i, score in enumerate(cv_scores, start=1):
print(f"Precisión del fold {i}: {score:.3f}")
# Calcular e imprimir la precisión promedio en la validación cruzada
print(f"Precision promedio en la validación cruzada: {np.mean(cv_scores):.2f}")
Precisión del fold 1: 0.900 Precisión del fold 2: 0.915 Precisión del fold 3: 0.905 Precisión del fold 4: 0.935 Precisión del fold 5: 0.907 Precision promedio en la validación cruzada: 0.91
# Predicción en el set de entrenamiento
y_pred_train = decision_tree.predict(X_train)
# predicción en el set de prueba
y_pred_test = decision_tree.predict(X_test)
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Training set accuracy:", accuracy_train)
print("Test set accuracy:", accuracy_test)
Training set accuracy: 0.9145 Test set accuracy: 0.915
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py:443: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names warnings.warn(
# Matriz de confusion
print(confusion_matrix(y_test, y_pred_test))
[[137 13] [ 21 229]]
matriz_confusion = confusion_matrix(y_test, y_pred)
def imprimir_matriz_confusion(matriz):
VN, FP = matriz[0]
FN, VP = matriz[1]
error_tipo_1 = FP / (FP + VN)
error_tipo_2 = FN / (FN + VP)
print(f"Verdaderos Negativos (VN): {VN}")
print(f"Falsos Positivos (FP): {FP}")
print(f"Falsos Negativos (FN): {FN}")
print(f"Verdaderos Positivos (VP): {VP}\n")
print(f"Error Tipo 1: {error_tipo_1:.2f}")
print(f"Error Tipo 2: {error_tipo_2:.2f}")
imprimir_matriz_confusion(matriz_confusion)
Verdaderos Negativos (VN): 137 Falsos Positivos (FP): 13 Falsos Negativos (FN): 21 Verdaderos Positivos (VP): 229 Error Tipo 1: 0.09 Error Tipo 2: 0.08
# reporte de clasificacion
print(classification_report(y_test, y_pred_test))
precision recall f1-score support
1 0.87 0.91 0.89 150
2 0.95 0.92 0.93 250
accuracy 0.92 400
macro avg 0.91 0.91 0.91 400
weighted avg 0.92 0.92 0.92 400
accuracies = list()
max_attributes = 12
depth_range = range(1, max_attributes + 1)
y_train = dataframe['cardiopatia']
x_train = dataframe.drop(['cardiopatia',], axis=1).values
# Testearemos la profundidad de 1 a cantidad de atributos +1
for depth in depth_range:
fold_accuracy = []
tree_model = tree.DecisionTreeClassifier(criterion='entropy',
min_samples_split=min_samples_split_best,
min_samples_leaf=min_samples_leaf_best,
max_depth=depth, class_weight=class_weight_best)
model = tree_model.fit(X = x_train,
y = y_train)
valid_acc = model.score(X = x_train,
y = y_train) # calculamos la precision con el segmento de validacion
fold_accuracy.append(valid_acc)
avg = sum(fold_accuracy)/len(fold_accuracy)
accuracies.append(avg)
# Mostramos los resultados obtenidos
df = pd.DataFrame({"Max Depth": depth_range, "Average Accuracy": accuracies})
df = df[["Max Depth", "Average Accuracy"]]
print(df.to_string(index=False))
Max Depth Average Accuracy
1 0.6325
2 0.6720
3 0.7540
4 0.7920
5 0.7920
6 0.8660
7 0.8915
8 0.9030
9 0.9085
10 0.9110
11 0.9110
12 0.9110
y_train = dataframe['cardiopatia']
x_train = dataframe.drop(['cardiopatia'], axis=1).values
# Crear Arbol de decision con profundidad
decision_tree = tree.DecisionTreeClassifier(criterion='entropy',
min_samples_split=min_samples_split_best,
min_samples_leaf=min_samples_leaf_best,
max_depth=10, class_weight=class_weight_best)
decision_tree.fit(x_train, y_train)
# exportar el modelo a archivo .dot
with open(r"tree1.dot", 'w') as f:
f = tree.export_graphviz(decision_tree,
out_file=f,
max_depth = 10,
impurity = True,
feature_names = list(dataframe.drop(['cardiopatia'], axis=1)),
class_names = ['Sano', 'Cardiopatía'],
rounded = True,
filled= True )
# Convertir el archivo .dot a png para poder visualizarlo
check_call(['dot','-Tpng',r'tree1.dot','-o',r'tree1.png'])
PImage("tree1.png")
#Precision del modelo
decision_tree.score(X,y)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py:443: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names warnings.warn(
0.911
# resultado 1 = Sano , 2 = Cardiopatía
#Sexo (1= mujer , 2 = hombre)
#Tabaco (1 si, 2 no, 3 antes, 4 NS/NR)
#HTA (1 si, 2 no, 3 antes, 4 NS/NR)
#Diabetes (1 si, 2 no, 3 antes, 4 NS/NR)
#Obesidad (1 si, 2 no, 3 antes, 4 NS/NR)
#Grupos de edad (1= <20 2= 20-44 3= 45a64 4 > 65)
# verificando el orden de las variables
entradas.columns
Index(['Sexe', 'Tabac', 'HTA', 'Diabetis', 'obesitat', 'edat_agrup'], dtype='object')
# Solicitar al usuario los valores de las variables
sexo = int(input("Ingrese el valor para Sexo (1= mujer, 2= hombre): "))
tabaco = int(input("Ingrese el valor para Tabaco (1= sí, 2= no, 3= antes, 4= NS/NR): "))
hta = int(input("Ingrese el valor para HTA (1= sí, 2= no, 3= antes, 4= NS/NR): "))
diabetes = int(input("Ingrese el valor para Diabetes (1= sí, 2= no, 3= antes, 4= NS/NR): "))
obesidad = int(input("Ingrese el valor para Obesidad (1= sí, 2= no, 3= antes, 4= NS/NR): "))
edad_agrup = int(input("Ingrese el valor para Grupo de edad (1= <20, 2= 20-44, 3= 45-64, 4= >65): "))
# Almacenar los valores de las variables en una lista
input_values = [sexo, tabaco, hta, diabetes, obesidad, edad_agrup]
# Convertir los valores ingresados por el usuario en un DataFrame de Pandas
input_df = pd.DataFrame([input_values], columns=X.columns)
# Realizar la predicción utilizando el modelo de árbol de decisión y los valores ingresados por el usuario
output_value = decision_tree.predict(input_df)
# Mostrar la salida predicha en la pantalla
if output_value == 1:
print("La salida predicha es: Tiene un perfil de paciente Sano")
else:
print("La salida predicha es: Tiene un perfil de paciente mas propenso a Cardiopatía")
Ingrese el valor para Sexo (1= mujer, 2= hombre): 2 Ingrese el valor para Tabaco (1= sí, 2= no, 3= antes, 4= NS/NR): 2 Ingrese el valor para HTA (1= sí, 2= no, 3= antes, 4= NS/NR): 2 Ingrese el valor para Diabetes (1= sí, 2= no, 3= antes, 4= NS/NR): 2 Ingrese el valor para Obesidad (1= sí, 2= no, 3= antes, 4= NS/NR): 2 Ingrese el valor para Grupo de edad (1= <20, 2= 20-44, 3= 45-64, 4= >65): 3 La salida predicha es: Tiene un perfil de paciente mas propenso a Cardiopatía
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py:443: UserWarning: X has feature names, but DecisionTreeClassifier was fitted without feature names warnings.warn(
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
print('Loading data...')
# Cargando el data set
data = pd.read_csv("cardiopatia_generated.csv")
# Transformando los espacios en blanco y convirtiendo las columnas enteros utilizando expresiones regulares
data = data.replace(r'\s+', 0, regex=True)
data['Tabac'] = data['Tabac'].astype(int)
data['HTA'] = data['HTA'].astype(int)
data['Diabetis'] = data['Diabetis'].astype(int)
data['obesitat'] = data['obesitat'].astype(int)
Loading data...
# Prepare the data
y = data['cardiopatia']
X = data.drop('cardiopatia', axis=1)
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create LightGBM datasets
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# Specify the settings as a dict
params = {
'boosting_type': 'gbdt',
'objective': 'multiclass',
'num_class': 6,
'metric': 'multi_logloss',
'num_leaves': 40,
'learning_rate': 0.01,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
"n_estimators": 100,
"max_depth": 35,
"min_child_samples": 2,
"verbose": 0
}
print('Starting training...')
# Train the model
gbm = lgb.train(params,
lgb_train,
valid_sets=lgb_eval,
early_stopping_rounds=10)
print('Saving model...')
# Save the model
gbm.save_model('model.txt')
Starting training... [LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000037 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`. [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [1] valid_0's multi_logloss: 0.652521 Training until validation scores don't improve for 10 rounds [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [2] valid_0's multi_logloss: 0.643333 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [3] valid_0's multi_logloss: 0.63469 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [4] valid_0's multi_logloss: 0.626035 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [5] valid_0's multi_logloss: 0.618375 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [6] valid_0's multi_logloss: 0.611535 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [7] valid_0's multi_logloss: 0.60207 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [8] valid_0's multi_logloss: 0.594369 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [9] valid_0's multi_logloss: 0.586637 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [10] valid_0's multi_logloss: 0.579542 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [11] valid_0's multi_logloss: 0.572547 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [12] valid_0's multi_logloss: 0.566201 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [13] valid_0's multi_logloss: 0.558256 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [14] valid_0's multi_logloss: 0.551352 [15] valid_0's multi_logloss: 0.542761 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [16] valid_0's multi_logloss: 0.536289 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [17] valid_0's multi_logloss: 0.529188 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [18] valid_0's multi_logloss: 0.52322 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [19] valid_0's multi_logloss: 0.517417 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [20] valid_0's multi_logloss: 0.512346 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [21] valid_0's multi_logloss: 0.50701 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [22] valid_0's multi_logloss: 0.502384 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [23] valid_0's multi_logloss: 0.497189 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [24] valid_0's multi_logloss: 0.492768 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [25] valid_0's multi_logloss: 0.488442 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [26] valid_0's multi_logloss: 0.482621 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [27] valid_0's multi_logloss: 0.476871 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [28] valid_0's multi_logloss: 0.47206 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [29] valid_0's multi_logloss: 0.467388 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [30] valid_0's multi_logloss: 0.46201 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [31] valid_0's multi_logloss: 0.457841 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [32] valid_0's multi_logloss: 0.45374 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [33] valid_0's multi_logloss: 0.449507 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [34] valid_0's multi_logloss: 0.444728 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [35] valid_0's multi_logloss: 0.440508 [36] valid_0's multi_logloss: 0.435756 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [37] valid_0's multi_logloss: 0.431916 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [38] valid_0's multi_logloss: 0.427459 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [39] valid_0's multi_logloss: 0.422902 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [40] valid_0's multi_logloss: 0.418642 [41] valid_0's multi_logloss: 0.413776 [42] valid_0's multi_logloss: 0.40903 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [43] valid_0's multi_logloss: 0.406017 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [44] valid_0's multi_logloss: 0.40326 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [45] valid_0's multi_logloss: 0.400238 [46] valid_0's multi_logloss: 0.395748 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [47] valid_0's multi_logloss: 0.392457 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [48] valid_0's multi_logloss: 0.389714 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [49] valid_0's multi_logloss: 0.387029 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [50] valid_0's multi_logloss: 0.384079 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [51] valid_0's multi_logloss: 0.380982 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [52] valid_0's multi_logloss: 0.378918 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [53] valid_0's multi_logloss: 0.375616 [54] valid_0's multi_logloss: 0.371742 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [55] valid_0's multi_logloss: 0.369202 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [56] valid_0's multi_logloss: 0.366354 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [57] valid_0's multi_logloss: 0.363479 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [58] valid_0's multi_logloss: 0.361108 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [59] valid_0's multi_logloss: 0.358849 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [60] valid_0's multi_logloss: 0.356721 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [61] valid_0's multi_logloss: 0.354245 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [62] valid_0's multi_logloss: 0.352287 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [63] valid_0's multi_logloss: 0.350201 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [64] valid_0's multi_logloss: 0.347366 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [65] valid_0's multi_logloss: 0.345246 [66] valid_0's multi_logloss: 0.341988 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [67] valid_0's multi_logloss: 0.339286 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [68] valid_0's multi_logloss: 0.337138 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [69] valid_0's multi_logloss: 0.334824 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [70] valid_0's multi_logloss: 0.332588 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [71] valid_0's multi_logloss: 0.330509 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [72] valid_0's multi_logloss: 0.327961 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [73] valid_0's multi_logloss: 0.326206 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [74] valid_0's multi_logloss: 0.324233 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [75] valid_0's multi_logloss: 0.32254 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [76] valid_0's multi_logloss: 0.320697 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [77] valid_0's multi_logloss: 0.319129 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [78] valid_0's multi_logloss: 0.317343 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [79] valid_0's multi_logloss: 0.315563 [80] valid_0's multi_logloss: 0.313388 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [81] valid_0's multi_logloss: 0.311498 [82] valid_0's multi_logloss: 0.308989 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [83] valid_0's multi_logloss: 0.307253 [84] valid_0's multi_logloss: 0.304835 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [85] valid_0's multi_logloss: 0.303483 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [86] valid_0's multi_logloss: 0.301843 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [87] valid_0's multi_logloss: 0.300267 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [88] valid_0's multi_logloss: 0.298682 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [89] valid_0's multi_logloss: 0.297015 [90] valid_0's multi_logloss: 0.295146 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [91] valid_0's multi_logloss: 0.293323 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [92] valid_0's multi_logloss: 0.292129 [93] valid_0's multi_logloss: 0.29074 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [94] valid_0's multi_logloss: 0.289344 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [95] valid_0's multi_logloss: 0.287633 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [96] valid_0's multi_logloss: 0.286232 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [97] valid_0's multi_logloss: 0.284559 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [98] valid_0's multi_logloss: 0.282917 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [99] valid_0's multi_logloss: 0.281814 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [100] valid_0's multi_logloss: 0.280249 Did not meet early stopping. Best iteration is: [100] valid_0's multi_logloss: 0.280249 Saving model...
C:\ProgramData\Anaconda3\lib\site-packages\lightgbm\engine.py:148: UserWarning: Found `n_estimators` in params. Will use it instead of argument
_log_warning("Found `{}` in params. Will use it instead of argument".format(alias))
<lightgbm.basic.Booster at 0x14b1d3447f0>
print('Starting predicting...')
# Predecir en el conjunto de prueba
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
y_pred = [list(x).index(max(x)) for x in y_pred]
# Calcular la precisión y la matriz de confusión
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
class_report = classification_report(y_test, y_pred)
print("Reporte de clasificación:")
print(class_report)
Starting predicting...
Accuracy: 0.915
Confusion Matrix:
[[137 13]
[ 21 229]]
Reporte de clasificación:
precision recall f1-score support
1 0.87 0.91 0.89 150
2 0.95 0.92 0.93 250
accuracy 0.92 400
macro avg 0.91 0.91 0.91 400
weighted avg 0.92 0.92 0.92 400