import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
# Load antihipertensivos.csv; the path is relative, so it is resolved against
# the current working directory.
dataframe = pd.read_csv(r"antihipertensivos.csv")
# Preview the first 15 rows.
dataframe.head(15)
Medicamento | Edad | Sexo | Etnia | IMC | Diabetes | Dislipidemias | Hist_HTA | Tabaquismo | Alcoholismo | Presion_inicial | Presion_actual | Infartos | ECG | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 4 | 1 | 2 | 3 | 2 | 2 | 1 | 3 | 1 | 1 | 2 | 2 | 4 |
1 | 3 | 3 | 2 | 3 | 4 | 1 | 3 | 1 | 1 | 5 | 3 | 1 | 2 | 4 |
2 | 4 | 4 | 2 | 2 | 4 | 1 | 1 | 1 | 5 | 1 | 3 | 1 | 2 | 1 |
3 | 3 | 4 | 2 | 1 | 2 | 3 | 1 | 1 | 2 | 1 | 3 | 3 | 2 | 2 |
4 | 4 | 4 | 2 | 2 | 1 | 1 | 2 | 3 | 1 | 5 | 1 | 3 | 2 | 5 |
5 | 1 | 4 | 2 | 2 | 1 | 3 | 1 | 3 | 4 | 4 | 1 | 1 | 2 | 3 |
6 | 2 | 3 | 2 | 2 | 2 | 3 | 1 | 1 | 4 | 1 | 3 | 1 | 3 | 4 |
7 | 1 | 4 | 2 | 2 | 2 | 3 | 2 | 2 | 2 | 5 | 3 | 1 | 3 | 3 |
8 | 4 | 3 | 2 | 2 | 2 | 1 | 1 | 1 | 5 | 3 | 3 | 4 | 2 | 5 |
9 | 5 | 4 | 2 | 1 | 2 | 1 | 1 | 1 | 3 | 5 | 3 | 3 | 3 | 5 |
10 | 2 | 4 | 2 | 3 | 4 | 1 | 2 | 1 | 5 | 4 | 3 | 3 | 2 | 5 |
11 | 4 | 2 | 2 | 2 | 1 | 1 | 3 | 3 | 4 | 1 | 3 | 2 | 3 | 2 |
12 | 3 | 1 | 2 | 1 | 2 | 1 | 1 | 1 | 5 | 4 | 3 | 2 | 2 | 5 |
13 | 4 | 4 | 2 | 2 | 1 | 1 | 2 | 1 | 5 | 1 | 3 | 1 | 1 | 2 |
14 | 2 | 4 | 2 | 3 | 1 | 2 | 2 | 1 | 5 | 5 | 3 | 3 | 2 | 2 |
# Preview the last 5 rows.
dataframe.tail(5)
Medicamento | Edad | Sexo | Etnia | IMC | Diabetes | Dislipidemias | Hist_HTA | Tabaquismo | Alcoholismo | Presion_inicial | Presion_actual | Infartos | ECG | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
49995 | 3 | 4 | 2 | 1 | 3 | 2 | 3 | 1 | 4 | 1 | 3 | 1 | 2 | 3 |
49996 | 1 | 3 | 2 | 2 | 2 | 2 | 2 | 3 | 4 | 3 | 3 | 2 | 2 | 2 |
49997 | 2 | 4 | 2 | 2 | 2 | 1 | 2 | 1 | 2 | 5 | 2 | 4 | 3 | 5 |
49998 | 3 | 3 | 1 | 1 | 3 | 3 | 2 | 2 | 4 | 4 | 3 | 1 | 3 | 2 |
49999 | 4 | 4 | 2 | 2 | 1 | 2 | 2 | 3 | 2 | 1 | 3 | 2 | 1 | 4 |
# List the dtype of every column.
dataframe.dtypes
Medicamento int64 Edad int64 Sexo int64 Etnia int64 IMC int64 Diabetes int64 Dislipidemias int64 Hist_HTA int64 Tabaquismo int64 Alcoholismo int64 Presion_inicial int64 Presion_actual int64 Infartos int64 ECG int64 dtype: object
# Whitespace replacement and integer casts, left disabled: the dtype listing
# for this dataset shows every column is already int64.
# dataframe=dataframe.replace(r'\s+', 0, regex=True)
# dataframe['Edad']=dataframe['Edad'].astype(int)
# dataframe['Sexo']=dataframe['Sexo'].astype(int)
# dataframe['Etnia']=dataframe['Etnia'].astype(int)
# dataframe['IMC']=dataframe['IMC'].astype(int)
# Summary statistics (count, mean, std, min, quartiles, max) per column.
dataframe.describe()
Medicamento | Edad | Sexo | Etnia | IMC | Diabetes | Dislipidemias | Hist_HTA | Tabaquismo | Alcoholismo | Presion_inicial | Presion_actual | Infartos | ECG | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.00000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 |
mean | 3.254280 | 3.172120 | 1.842760 | 1.792100 | 1.799600 | 1.92316 | 1.918420 | 1.694700 | 3.712080 | 3.086960 | 2.526720 | 2.031620 | 2.177780 | 2.877700 |
std | 1.391151 | 0.966951 | 0.364031 | 0.707565 | 1.114444 | 0.81402 | 0.811208 | 0.869567 | 1.191256 | 1.569741 | 0.711995 | 1.126697 | 0.582427 | 1.487127 |
min | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 1.00000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
25% | 2.000000 | 3.000000 | 2.000000 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | 1.000000 | 3.000000 | 1.000000 | 2.000000 | 1.000000 | 2.000000 | 2.000000 |
50% | 3.000000 | 3.000000 | 2.000000 | 2.000000 | 2.000000 | 2.00000 | 2.000000 | 1.000000 | 4.000000 | 3.000000 | 3.000000 | 2.000000 | 2.000000 | 2.000000 |
75% | 5.000000 | 4.000000 | 2.000000 | 2.000000 | 3.000000 | 3.00000 | 3.000000 | 3.000000 | 5.000000 | 5.000000 | 3.000000 | 3.000000 | 3.000000 | 5.000000 |
max | 5.000000 | 4.000000 | 2.000000 | 3.000000 | 4.000000 | 3.00000 | 3.000000 | 3.000000 | 5.000000 | 5.000000 | 3.000000 | 4.000000 | 3.000000 | 5.000000 |
# Print the number of rows for each value of the 'Medicamento' column.
print(dataframe.groupby('Medicamento').size())
Medicamento 1 7346 2 8341 3 11828 4 9223 5 13262 dtype: int64
## Now inspect the remaining variables as histograms.
# One histogram per column, excluding 'Medicamento'.
# `axis` is passed by keyword: the positional form drop(labels, 1) is
# deprecated since pandas 1.3 and raises TypeError on pandas 2.x.
dataframe.drop(['Medicamento'], axis=1).hist()
plt.show()
# sb.pairplot(dataframe.dropna(), hue='Medicamento',height=4,vars=["Edad", "Sexo","Etnia","IMC","Diabetes","Dislipidemias", "Hist_HTA","Alcoholismo","Tabaquismo","Presion_inicial","Presion_actual","Infartos","ECG"],kind='reg')
# Fit a logistic regression first, to have a baseline for comparison.
# X holds every column except 'Medicamento'; y holds 'Medicamento'.
# `axis` is passed by keyword: the positional form drop(labels, 1) is
# deprecated since pandas 1.3 and raises TypeError on pandas 2.x.
X = np.array(dataframe.drop(['Medicamento'], axis=1))
y = np.array(dataframe['Medicamento'])
X.shape
(50000, 13)
# Baseline logistic regression on the full dataset.
# The solver is named explicitly so the fit does not depend on the installed
# scikit-learn's default (the default changed from 'liblinear' to 'lbfgs' in
# 0.22, which is what the FutureWarning is about). max_iter is raised above
# the default of 100 to give lbfgs room to converge on the unscaled features.
# NOTE(review): class_weight={1:1} only names class 1 with weight 1; this
# presumably leaves all classes equally weighted -- confirm it is intentional.
model = linear_model.LogisticRegression(class_weight={1:1},
                                        solver='lbfgs',
                                        max_iter=1000)
model.fit(X,y)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning. FutureWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:469: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning. "this warning.", FutureWarning)
LogisticRegression(C=1.0, class_weight={1: 1}, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, multi_class='warn', n_jobs=None, penalty='l2', random_state=None, solver='warn', tol=0.0001, verbose=0, warm_start=False)
# Mean accuracy on the same X, y the model was fitted on, so this is a
# training-set score rather than a held-out estimate.
model.score(X,y)
0.26606
# Predict on the training rows and print the confusion matrix
# (first argument is the true labels, second the predicted labels).
predictions = model.predict(X)
print(confusion_matrix(y, predictions))
[[ 0 0 192 0 7154] [ 0 0 277 0 8064] [ 0 0 359 0 11469] [ 0 0 268 0 8955] [ 0 0 318 0 12944]]
# Print per-class precision, recall and F1 for the same predictions.
print(classification_report(y, predictions))
precision recall f1-score support 1 0.00 0.00 0.00 7346 2 0.00 0.00 0.00 8341 3 0.25 0.03 0.05 11828 4 0.00 0.00 0.00 9223 5 0.27 0.98 0.42 13262 accuracy 0.27 50000 macro avg 0.10 0.20 0.09 50000 weighted avg 0.13 0.27 0.12 50000
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 'precision', 'predicted', average, warn_for)
# Decision tree
# Imports needed for the script
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline
# Default figure size and plot style for the charts that follow.
plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont
# Input features: every column except 'Medicamento'.
entradas = dataframe.drop(columns=['Medicamento'])
# Show the remaining column names.
entradas.columns
Index(['Edad', 'Sexo', 'Etnia', 'IMC', 'Diabetes', 'Dislipidemias', 'Hist_HTA', 'Tabaquismo', 'Alcoholismo', 'Presion_inicial', 'Presion_actual', 'Infartos', 'ECG'], dtype='object')
# Search over tree depths and report the mean cross-validated accuracy of
# each one. Every depth is scored on held-out folds; scoring a tree on the
# rows it was fitted on yields a training accuracy that cannot be used to
# choose a depth.
accuracies = list()
max_attributes = 60
depth_range = range(25, max_attributes + 1)
y_train = dataframe['Medicamento']
x_train = dataframe.drop(['Medicamento',], axis=1).values

# Shuffled 10-fold split with a fixed seed, so every depth is evaluated on
# the same folds and the comparison between depths is like-for-like.
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# Test each candidate tree depth
for depth in depth_range:
    tree_model = tree.DecisionTreeClassifier(criterion='entropy',
                                             min_samples_split=2,
                                             min_samples_leaf=2,
                                             max_depth = depth,
                                             class_weight={1:1})
    # Accuracy on each validation fold: cross_val_score fits a fresh clone
    # of the tree on the other folds and scores it on the held-out one.
    fold_accuracy = cross_val_score(tree_model, x_train, y_train, cv=cv)
    avg = sum(fold_accuracy)/len(fold_accuracy)
    accuracies.append(avg)

# Show the results: one row per depth with its mean validation accuracy
df = pd.DataFrame({"Profundidad": depth_range, "Precision promedio": accuracies})
df = df[["Profundidad", "Precision promedio"]]
print(df.to_string(index=False))
Profundidad Precision promedio 25 0.70360 26 0.70372 27 0.70368 28 0.70384 29 0.70426 30 0.70374 31 0.70400 32 0.70396 33 0.70336 34 0.70384 35 0.70338 36 0.70410 37 0.70386 38 0.70398 39 0.70410 40 0.70346 41 0.70322 42 0.70462 43 0.70386 44 0.70376 45 0.70406 46 0.70360 47 0.70360 48 0.70408 49 0.70448 50 0.70380 51 0.70368 52 0.70382 53 0.70416 54 0.70368 55 0.70410 56 0.70392 57 0.70396 58 0.70436 59 0.70342 60 0.70398
# NOTE: the hyperparameters below still need tuning.
# Features are every column except 'Medicamento'; the target is 'Medicamento'.
x_train = dataframe.drop(['Medicamento'], axis=1).values
y_train = dataframe['Medicamento']
# Decision tree settings, gathered in one place
tree_settings = {
    'criterion': 'entropy',
    'min_samples_split': 2,
    'min_samples_leaf': 2,
    'max_depth': 42,
    'class_weight': {1: 1},
}
decision_tree = tree.DecisionTreeClassifier(**tree_settings)
decision_tree.fit(x_train, y_train)
# Export the fitted tree to a Graphviz .dot file.
# export_graphviz writes into the open handle passed as out_file; its return
# value is deliberately not assigned, so `f` keeps referring to the file
# for the whole `with` block.
with open(r"tree1.dot", 'w') as f:
    tree.export_graphviz(decision_tree,
                         out_file=f,
                         max_depth = 42,
                         impurity = True,
                         feature_names = list(dataframe.drop(['Medicamento'], axis=1)),
                         # NOTE(review): labels 4 and 5 carry the same name
                         # ('IBST+HCT'); confirm which treatment class 5 is.
                         class_names = ['1: ENLP', '2: ENLP+HCT', '3: AMLD', '4: IBST+HCT', '5: IBST+HCT'],
                         rounded = True,
                         filled= True )
# Convert the .dot file to PNG so it can be displayed. This runs the external
# `dot` command; check_call raises CalledProcessError if it exits non-zero.
check_call(['dot','-Tpng',r'tree1.dot','-o',r'tree1.png'])
# Display the rendered image in the notebook.
PImage("tree1.png")