Notebook sobre Vacunación contra el Covid19 en Costa Rica y el Mundo

elaborada por el Dr. Juan I. Barrios

Instituto Algoritmia, Barcelona. España 2021

In [1]:
!conda activate geo_env
In [2]:
## Aca se importan las librerias necesarias, la mas importante para esta prueba es PANDAS 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pycountry
from bokeh.io import output_notebook, show, output_file
from bokeh.plotting import figure, output_file, save
from bokeh.models import GeoJSONDataSource, LinearColorMapper, ColorBar
from bokeh.palettes import brewer
from datetime import datetime
from bokeh.models import Slider, HoverTool
import pandas as pd
import geopandas as gpd
sns.set_style('darkgrid')
%matplotlib inline
In [3]:
##Con este comando leemos los datos de la web 
dataframe=pd.read_csv('https://github.com/owid/covid-19-data/blob/master/public/data/owid-covid-data.csv?raw=true', error_bad_lines=False)
In [4]:
# Listados los primeros registros del set de datos
dataframe.tail(5)
Out[4]:
iso_code continent location date total_cases new_cases new_cases_smoothed total_deaths new_deaths new_deaths_smoothed ... gdp_per_capita extreme_poverty cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers handwashing_facilities hospital_beds_per_thousand life_expectancy human_development_index
85166 ZWE Africa Zimbabwe 2021-04-26 38102.0 16.0 34.714 1560.0 3.0 1.000 ... 1899.775 21.4 307.846 1.82 1.6 30.7 36.791 1.7 61.49 0.571
85167 ZWE Africa Zimbabwe 2021-04-27 38164.0 62.0 41.286 1565.0 5.0 1.571 ... 1899.775 21.4 307.846 1.82 1.6 30.7 36.791 1.7 61.49 0.571
85168 ZWE Africa Zimbabwe 2021-04-28 38191.0 27.0 30.143 1565.0 0.0 1.429 ... 1899.775 21.4 307.846 1.82 1.6 30.7 36.791 1.7 61.49 0.571
85169 ZWE Africa Zimbabwe 2021-04-29 38235.0 44.0 31.000 1567.0 2.0 1.714 ... 1899.775 21.4 307.846 1.82 1.6 30.7 36.791 1.7 61.49 0.571
85170 ZWE Africa Zimbabwe 2021-04-30 38257.0 22.0 30.286 1567.0 0.0 1.571 ... 1899.775 21.4 307.846 1.82 1.6 30.7 36.791 1.7 61.49 0.571

5 rows × 59 columns

In [5]:
## Escogemos las variables especificas que necesitamos 
df=dataframe
dataframe_pais=dataframe[['location','continent','total_vaccinations','people_fully_vaccinated','population']]
dataframe_pais.tail(5) 
Out[5]:
location continent total_vaccinations people_fully_vaccinated population
85166 Zimbabwe Africa 411610.0 57776.0 14862927.0
85167 Zimbabwe Africa 433939.0 63263.0 14862927.0
85168 Zimbabwe Africa 458013.0 69992.0 14862927.0
85169 Zimbabwe Africa 477597.0 76826.0 14862927.0
85170 Zimbabwe Africa 500342.0 85607.0 14862927.0
In [6]:
##  aca agrupamos los registros de acuerdo a las variables location y continente, creamos un nuevo dataset y le decimos al nuevo data set (df_pais) las variables adicionales que vamos a utilizar
dataframe_pais=dataframe_pais.groupby(['location',])['continent', 'total_vaccinations','people_fully_vaccinated','population'].last().reset_index()
C:\Users\Tommy\Anaconda3\envs\geo_env\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  
In [7]:
## aca listamos ese nuevo dataset pero solo los primeros 5 registros (paises con la función (heading) 
dataframe_pais.head()
Out[7]:
location continent total_vaccinations people_fully_vaccinated population
0 Afghanistan Asia 240000.0 NaN 3.892834e+07
1 Africa None 17827619.0 4876161.0 1.340598e+09
2 Albania Europe 476903.0 655.0 2.877800e+06
3 Algeria Africa 75000.0 NaN 4.385104e+07
4 Andorra Europe 26414.0 4681.0 7.726500e+04

Creamos las nuevas variables del dataframe "_pais"

In [8]:
dataframe_pais['Cobertura']=(dataframe_pais['total_vaccinations']*0.5)/(dataframe_pais['population']*1.25)*100
dataframe_cobertura=dataframe_pais.sort_values('Cobertura',ascending=False)
df1=df[[ 'location', 'date','total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'new_vaccinations',
       'population',]]
df1.dropna(subset=['new_vaccinations'],inplace=True)
cntry=[]
value=[]
for loc in df1.location.unique():
  cntry.append(loc)
  value.append(df1[df1['location']==loc]['new_vaccinations'].rolling(7, center=False).mean().mean())
df1=pd.DataFrame({'location':cntry,'Average_daily_doses':value})
df_full2=pd.merge(dataframe_cobertura,df1,on='location',how='outer')
df_full=pd.merge(dataframe_cobertura,df1,on='location',how='outer')
df_full['Cobertura']=((df_full['total_vaccinations']*0.5)/df_full['population']*1.25)*100
df_full['Days_70%_vaccination']=(((df_full['population']*0.7)-(df_full['total_vaccinations']*0.5))/(df_full['Average_daily_doses']*0.5))
df_full['Percent_Vaccinated']=((df_full['total_vaccinations']*0.5)/df_full['population'])*100
df_full.sort_values('Days_70%_vaccination',ascending=True,inplace=True)
df_full.dropna()
df_full.sort_values('Days_70%_vaccination') 
pd.reset_option('display.max_rows')
pd.set_option("display.max_rows",215)
C:\Users\Tommy\Anaconda3\envs\geo_env\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
In [9]:
# Pais de comprobación
# df_full[df_full['location']=='Spain']

Imprimir el reporte general de países "_salida"

In [10]:
dataframe_salida = df_full
dataframe_salida.rename(columns = {'location':'PAISES', 'continent':'CONTINENTE',
                          'total_vaccinations':'TOTAL VACUNAS APLICADAS', 
                          'Percent_Vaccinated':'PORCENTAJE POBLACION VACUNADA', 
                          'Cobertura': 'POBLACION +DE 18a CUBIERTA'}, inplace = True)
dataframe_salida.dropna(subset = ["PORCENTAJE POBLACION VACUNADA"], inplace=True)
dataframe_salida=dataframe_salida[['PAISES','CONTINENTE','TOTAL VACUNAS APLICADAS','population', 'POBLACION +DE 18a CUBIERTA', 'PORCENTAJE POBLACION VACUNADA']]                          
dataframe_salida2=dataframe_salida.sort_values('PORCENTAJE POBLACION VACUNADA',ascending=False,inplace=True)
pd.options.display.float_format = '{:,.0f}'.format
dataframe_salida.to_html('../vacunas/salidas/listado_general.html')
dataframe_salida.rename(columns = {'location':'PAISES', 'continent':'CONTINENTE',
                          'TOTAL VACUNAS APLICADAS':'total_vaccinations', 
                          'PORCENTAJE POBLACION VACUNADA':'Percent_Vaccinated', 
                          'POBLACION +DE 18a CUBIERTA':'Cobertura'}, inplace = True)
C:\Users\Tommy\Anaconda3\envs\geo_env\lib\site-packages\ipykernel_launcher.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
C:\Users\Tommy\AppData\Roaming\Python\Python37\site-packages\pandas\core\frame.py:4446: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

Acá calculamos el número de días para llegar al 70% con el "_dias"

In [11]:
dataframe_dias = df_full
#dataframe_dias.dropna(subset = ["Days_70%_vaccination"], inplace=True)
dataframe_dias.rename(columns = {'PAIS':'PAISES','POBLACION':'population','Days_70%_vaccination':'NUMERO DE DIAS 70%'}, inplace = True)
#                              'total_vaccinations':'TOTAL VACUNAS APLICADAS', 
#                              'Percent_Vaccinated':'PORCENTAJE DE LA POBLACION VACUNADA', 
#                              'Days_70%_vaccination':'NUMERO DE DIAS 70%'}, inplace = True)
dataframe_dias=dataframe_dias[['PAISES', 'CONTINENTE','population','NUMERO DE DIAS 70%']]                          
dataframe_dias2=dataframe_dias.sort_values('NUMERO DE DIAS 70%',ascending=False,inplace=True)
pd.options.display.float_format = '{:,.0f}'.format
dataframe_dias.to_html('../vacunas/salidas/listado_dias.html')
dataframe_dias.rename(columns = {'PAIS':'PAISES', 'POBLACION':'population'}, inplace = True)
C:\Users\Tommy\Anaconda3\envs\geo_env\lib\site-packages\ipykernel_launcher.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
C:\Users\Tommy\AppData\Roaming\Python\Python37\site-packages\pandas\core\frame.py:4446: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,

Inicia proceso análisis : Países mas avanzados en el proceso de vacunación

In [12]:
df_full.columns
Out[12]:
Index(['PAISES', 'CONTINENTE', 'TOTAL VACUNAS APLICADAS',
       'people_fully_vaccinated', 'population', 'POBLACION +DE 18a CUBIERTA',
       'Average_daily_doses', 'NUMERO DE DIAS 70%',
       'PORCENTAJE POBLACION VACUNADA'],
      dtype='object')
In [13]:
dataframe_calculos=df_full
dataframe_calculos.dropna
dataframe_calculos_sort=dataframe_calculos.sort_values('POBLACION +DE 18a CUBIERTA',ascending=False)
plt.figure(figsize=(12,8))
plt.title('Los 10 países mas avanzados en el proceso de vacunación')
dataframe_calculos_sort.rename(columns = {'POBLACION +DE 18a CUBIERTA':'Porcentaje de población mayor de 18 años'}, inplace = True)
sns.barplot(x = "PAISES", y = "Porcentaje de población mayor de 18 años", data = dataframe_calculos_sort.head(10))
plt.xticks(rotation=70)
plt.savefig('../vacunas/salidas/los_mejores.jpg')
plt.show()
dataframe_calculos_sort.rename(columns = {'Porcentaje de población mayor de 18 años':'Cobertura'}, inplace = True)

Países mas rezagados con procesos de vacunación activos

In [14]:
dataframe_nulos= dataframe_pais[dataframe_pais['Cobertura'].isnull()]
dataframe_nulos1=dataframe_nulos.head(30)
plt.figure(figsize=(12,8))
plt.title('Los 21 países sin procesos de vacunación')
dataframe_nulos1.rename(columns = {'Cobertura':'Porcentaje de población mayor de 18 años','location':'PAISES' }, inplace = True)
sns.barplot(x = 'PAISES', y = 'Porcentaje de población mayor de 18 años', data = dataframe_nulos1)
plt.xticks(rotation=70)
plt.savefig('../vacunas/salidas/sin_vacuna.jpg')
plt.show()
dataframe_nulos1.rename(columns = {'Porcentaje de población mayor de 18 años':'Cobertura','PAISES':'location'}, inplace = True)
In [15]:
dataframe_calculos.columns
Out[15]:
Index(['PAISES', 'CONTINENTE', 'TOTAL VACUNAS APLICADAS',
       'people_fully_vaccinated', 'population', 'POBLACION +DE 18a CUBIERTA',
       'Average_daily_doses', 'NUMERO DE DIAS 70%',
       'PORCENTAJE POBLACION VACUNADA'],
      dtype='object')
In [16]:
dataframe_calculos.dropna
dataframe_calculos_sort=dataframe_calculos.sort_values('POBLACION +DE 18a CUBIERTA',ascending=True)
plt.figure(figsize=(12,8))
plt.title('Los 20 países mas rezagados en el proceso de vacunación  (de los que ya vacunan) ')
dataframe_calculos_sort.rename(columns = {'POBLACION +DE 18a CUBIERTA':'Porcentaje de población mayor de 18 años','location':'PAISES'}, inplace = True)
sns.barplot(x = "PAISES", y = "Porcentaje de población mayor de 18 años", data = dataframe_calculos_sort.head(20))
plt.xticks(rotation=70)
plt.savefig('../vacunas/salidas/los_rezagados.jpg')
plt.show()
dataframe_calculos_sort.rename(columns = {'Porcentaje de población mayor de 18 años':'Cobertura'}, inplace = True)

COBERTURA DE VACUNACIÓN POR CONTINENTE

Listando los datos de un contiente específico

In [17]:
dataframe_continente=dataframe_pais.groupby('continent')['total_vaccinations','population','Cobertura'].last().reset_index()
dataframe=dataframe.groupby('location')['total_cases','total_deaths','population'].last().reset_index()
dataframe_continente.sort_values('Cobertura',ascending=True)
C:\Users\Tommy\Anaconda3\envs\geo_env\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  """Entry point for launching an IPython kernel.
C:\Users\Tommy\Anaconda3\envs\geo_env\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  
Out[17]:
continent total_vaccinations population Cobertura
1 Asia 506,435 29,825,968 0
5 South America 250,000 28,435,943 0
0 Africa 500,342 14,862,927 1
4 Oceania 5,367 307,150 2
2 Europe 48,748,962 809 29
3 North America 240,159,677 331,002,647 29

Cobertura de vacunación por Continentes

In [18]:
dataframe_continente.rename(columns = {'Cobertura':'Porcentaje de población mayor de 18 años','continent':'CONTINENTES'}, inplace = True)
plt.figure(figsize=(12,8))
sns.barplot(x = "CONTINENTES", y = "Porcentaje de población mayor de 18 años", data = dataframe_continente.sort_values('Porcentaje de población mayor de 18 años',ascending=True))
plt.xticks(rotation=70)
plt.title('Cobertura de vacunación global por continente')
plt.savefig('../vacunas/salidas/cobertura_continente.jpg')
plt.show()
dataframe_continente.rename(columns = {'Porcentaje de población mayor de 18 años':'Cobertura','CONTINENTES':'continent'}, inplace = True)

Cobertura de vacunación en Norte y Centroamérica

Graficando países de Norte y Centro America con respecto a Cobertura

In [19]:
## creamos un nuevo dataset para ver los países con las coberturas mas altas en Norte America
dataframe_continente=dataframe_pais.groupby('continent')['total_vaccinations','population','Cobertura'].last().reset_index()
dataframe=dataframe.groupby('location')['total_cases','total_deaths','population'].last().reset_index()
dataframe_continente_america=dataframe_pais[dataframe_pais['continent']=='North America']
dataframe_continente_america2=dataframe_continente_america.sort_values('Cobertura',ascending=True,)
C:\Users\Tommy\Anaconda3\envs\geo_env\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  
C:\Users\Tommy\Anaconda3\envs\geo_env\lib\site-packages\ipykernel_launcher.py:3: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  This is separate from the ipykernel package so we can avoid doing imports until
In [20]:
# Construyendo el grafico
##Quitamos los valores Nan 
plt.figure(figsize=(12,8))
dataframe_continente_america2.dropna(inplace=True)
sns.barplot(x = "location", y = "Cobertura", data = dataframe_continente_america2.head(100))
plt.xticks(rotation=60)
plt.title('America del Norte y Centro America coberturas en vacunación')
plt.savefig('../vacunas/salidas/america.jpg')
plt.show()

Generando un mapa interactivo con BOKEH

In [21]:
df_full.rename(columns = {'PAISES':'location','CONTINENTE':'continent',
                          'TOTAL VACUNAS APLICADAS':'total_vaccinations', 
                          'PORCENTAJE DE LA POBLACION VACUNADA':'Percent_Vaccinated'}, inplace = True)
df_full.columns
Out[21]:
Index(['location', 'continent', 'total_vaccinations',
       'people_fully_vaccinated', 'population', 'POBLACION +DE 18a CUBIERTA',
       'Average_daily_doses', 'NUMERO DE DIAS 70%',
       'PORCENTAJE POBLACION VACUNADA'],
      dtype='object')
In [22]:
import pandas as pd
import geopandas as gpd
shapefile = '../data/countries_110m/ne_110m_admin_0_countries.shp'

gdf = gpd.read_file(shapefile)[['ADMIN', 'ADM0_A3', 'geometry']]
gdf.columns = ['country', 'country_code', 'geometry']
gdf = gdf.drop(gdf.index[159])
gdf.head()
Out[22]:
country country_code geometry
0 Fiji FJI MULTIPOLYGON (((180.00000 -16.06713, 180.00000...
1 United Republic of Tanzania TZA POLYGON ((33.90371 -0.95000, 34.07262 -1.05982...
2 Western Sahara SAH POLYGON ((-8.66559 27.65643, -8.66512 27.58948...
3 Canada CAN MULTIPOLYGON (((-122.84000 49.00000, -122.9742...
4 United States of America USA MULTIPOLYGON (((-122.84000 49.00000, -120.0000...
In [23]:
import pycountry

input_countries = df_full.location.values.tolist()

codes = []
for country in input_countries:
    try:
      codes.append(pycountry.countries.search_fuzzy(country)[0].alpha_3)
    except:
      codes.append('Unknown code')

df_full['country_code']=codes
In [24]:
df_full.columns
Out[24]:
Index(['location', 'continent', 'total_vaccinations',
       'people_fully_vaccinated', 'population', 'POBLACION +DE 18a CUBIERTA',
       'Average_daily_doses', 'NUMERO DE DIAS 70%',
       'PORCENTAJE POBLACION VACUNADA', 'country_code'],
      dtype='object')
In [25]:
#Perform left merge to preserve every row in gdf.
df_full.rename(columns = {'PORCENTAJE POBLACION VACUNADA':'Percent_Vaccinated','POBLACION +DE 18a CUBIERTA':'Cobertura'}, inplace = True)
merged = gdf.merge(df_full[['country_code','Cobertura','Percent_Vaccinated']], left_on = 'country_code', right_on = 'country_code', how = 'left')

#Replace NaN values to string 'No data'.
merged.fillna('No data', inplace = True)

import json

#Read data to json
merged_json = json.loads(merged.to_json())

#Convert to str like object
json_data = json.dumps(merged_json)
In [26]:
df2=df[[ 'location', 'date','total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'new_vaccinations',
       'population',]]
df2['date']=pd.to_datetime(df2['date'])

def mapper(month):
   return month.strftime('%Y-%m') 

df2['Month']=df2['date'].apply(mapper)
C:\Users\Tommy\Anaconda3\envs\geo_env\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
C:\Users\Tommy\Anaconda3\envs\geo_env\lib\site-packages\ipykernel_launcher.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
In [27]:
df2.groupby(['location','Month'])['total_vaccinations','people_fully_vaccinated','population'].last().reset_index()
df2['Cobertura']=(df2['people_fully_vaccinated']/df2['population'])*100
df2['Percent_Vaccinated']=((df2['total_vaccinations']*0.5)/df2['population']*1.25)*100
C:\Users\Tommy\Anaconda3\envs\geo_env\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  """Entry point for launching an IPython kernel.
C:\Users\Tommy\Anaconda3\envs\geo_env\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
C:\Users\Tommy\Anaconda3\envs\geo_env\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
In [28]:
#Perform left merge to preserve every row in gdf.
merged = gdf.merge(df_full[['country_code','Cobertura','Percent_Vaccinated']], left_on = 'country_code', right_on = 'country_code', how = 'left')

#Replace NaN values to string 'No data'.
merged.fillna(0, inplace = True)

import json

#Read data to json
merged_json = json.loads(merged.to_json())

#Convert to str like object
json_data = json.dumps(merged_json)
In [29]:
#Input GeoJSON source that contains features for plotting.
geosource = GeoJSONDataSource(geojson = json_data)
#Define a sequential multi-hue color palette.
palette = brewer['YlGnBu'][8]
#Reverse color order so that dark blue is highest obesity.
palette = palette[::-1]
#Instantiate LinearColorMapper that linearly maps numbers in a range, into a sequence of colors.
color_mapper = LinearColorMapper(palette = palette, low = 0, high = 20)
#Define custom tick labels for color bar.
tick_labels = {'0': '0%', '1': '1%', '5':'5%', '10':'10%', '20':'20%', '50':'50%'}
#Add hover tool
hover = HoverTool(tooltips = [ ('Pais/Region','@country'),('Cobertura población mayor de 18 años', '@Cobertura'),('Porcentaje población vacunada','@Percent_Vaccinated')])
#Create color bar. 
color_bar = ColorBar(color_mapper=color_mapper, label_standoff=8,width = 500, height = 20,
border_line_color=None,location = (0,0), orientation = 'horizontal', major_label_overrides = tick_labels)
#Create figure object.
p = figure(title = f'Cobertura de vacunación COVID por paises al {datetime.now().date()}', plot_height = 600 , plot_width = 950, toolbar_location = None,tools=[hover])
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None
#Add patch renderer to figure. 
p.patches('xs','ys', source = geosource,fill_color = {'field' :'Percent_Vaccinated', 'transform' : color_mapper},
          line_color = 'black', line_width = 0.25, fill_alpha = 1)
#Specify figure layout.
p.add_layout(color_bar, 'below')

#Display figure inline in Jupyter Notebook.
output_notebook()

#Display figure.
show(p)
output_file("../vacunas/salidas/mapai.html")
save(p) 
Loading BokehJS ...
Out[29]:
'D:\\Machine learning\\mapas\\vacunas\\salidas\\mapai.html'
In [ ]:
 
In [30]:
#  Aplicando algoritmo Kmeans a nuestro dataset 
from sklearn.cluster import KMeans
dataframe_clusters = dataframe_pais
dataframe_clusters = df.reset_index()
In [31]:
inertias = [] 
K = range(1,10) 
dataframe_clusters.fillna(0,inplace=True)  
for k in K: 
    #Crear y ajustar el modelo 
    kmeanModel = KMeans(n_clusters=k).fit(dataframe_clusters.drop(['continent','location','iso_code','date','tests_units'],axis=1)) 
    inertias.append(kmeanModel.inertia_) 
    
plt.plot(K, inertias, 'bx-') 
plt.xlabel('VALORES DE K') 
plt.ylabel('Inertia') 
plt.title('EL METODO DEL CODO USANDO INERTIA' ) 
plt.show()
In [32]:
## El numero de clusters o grupos sera de 4 
kmeans = KMeans(n_clusters = 4, init = 'k-means++', random_state = 42)
y_kmeans = kmeans.fit_predict(dataframe_clusters.drop(['continent','location','iso_code','date','tests_units'],axis=1))
y_kmeans1=y_kmeans+1
cluster = pd.DataFrame(y_kmeans1)
today_sub=dataframe_clusters.drop(['continent','location','iso_code','date','tests_units'],axis=1)
# aca añadimos la variable cluster a nuestro nuevo dataset 
today_sub['cluster'] = cluster
# Promedio de los valores del cluster
kmeans_mean_cluster = pd.DataFrame(round(today_sub.groupby('cluster').mean(),1))

## Listando los valores promedios de las variables utilizadas en cada cluster
kmeans_mean_cluster
Out[32]:
index total_cases new_cases new_cases_smoothed total_deaths new_deaths new_deaths_smoothed total_cases_per_million new_cases_per_million new_cases_smoothed_per_million ... gdp_per_capita extreme_poverty cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers handwashing_facilities hospital_beds_per_thousand life_expectancy human_development_index
cluster
1 43,076 394,292 2,724 2,696 10,395 63 62 9,915 73 72 ... 17,683 8 238 7 8 23 23 3 70 1
2 83,736 46,359,287 323,819 318,283 1,132,150 6,839 6,747 5,948 42 41 ... 15,469 10 233 8 6 35 60 3 73 1
3 4,238 11,280,115 83,231 79,990 187,348 1,119 1,086 2,431 18 17 ... 0 0 0 0 0 0 0 0 0 0
4 19,735 5,051,030 37,555 36,685 119,438 741 730 5,646 42 42 ... 5,498 6 137 5 1 18 15 1 37 0

4 rows × 55 columns

In [33]:
## Acá podemos ver el grupo de paises en cada cluster o grupo 
dataframe_clusters_2=dataframe_clusters.copy()
dataframe_clusters_2['cluster']= cluster
In [34]:
##Para listar los paises dentro de cada cluster separados
dataframe_clusters_2[dataframe_clusters_2[('cluster')]==2]
Out[34]:
index iso_code continent location date total_cases new_cases new_cases_smoothed total_deaths new_deaths ... extreme_poverty cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers handwashing_facilities hospital_beds_per_thousand life_expectancy human_development_index cluster
83504 83504 OWID_WRL 0 World 2020-01-22 557 0 0 17 0 ... 10 233 9 6 35 60 3 73 1 2
83505 83505 OWID_WRL 0 World 2020-01-23 655 98 0 18 1 ... 10 233 9 6 35 60 3 73 1 2
83506 83506 OWID_WRL 0 World 2020-01-24 941 286 0 26 8 ... 10 233 9 6 35 60 3 73 1 2
83507 83507 OWID_WRL 0 World 2020-01-25 1,433 492 0 42 16 ... 10 233 9 6 35 60 3 73 1 2
83508 83508 OWID_WRL 0 World 2020-01-26 2,118 685 0 56 14 ... 10 233 9 6 35 60 3 73 1 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
83964 83964 OWID_WRL 0 World 2021-04-26 147,872,402 682,784 824,855 3,120,469 11,156 ... 10 233 9 6 35 60 3 73 1 2
83965 83965 OWID_WRL 0 World 2021-04-27 148,716,872 844,470 823,432 3,134,956 14,487 ... 10 233 9 6 35 60 3 73 1 2
83966 83966 OWID_WRL 0 World 2021-04-28 149,622,864 905,992 825,721 3,150,675 15,719 ... 10 233 9 6 35 60 3 73 1 2
83967 83967 OWID_WRL 0 World 2021-04-29 150,520,466 897,602 825,413 3,165,665 14,990 ... 10 233 9 6 35 60 3 73 1 2
83968 83968 OWID_WRL 0 World 2021-04-30 151,399,480 879,014 822,724 3,180,238 14,573 ... 10 233 9 6 35 60 3 73 1 2

465 rows × 61 columns

In [35]:
dataframe_clusters_2[dataframe_clusters_2['cluster']==3] 
Out[35]:
index iso_code continent location date total_cases new_cases new_cases_smoothed total_deaths new_deaths ... extreme_poverty cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers handwashing_facilities hospital_beds_per_thousand life_expectancy human_development_index cluster
4006 4006 OWID_ASI 0 Asia 2020-01-22 556 0 0 17 0 ... 0 0 0 0 0 0 0 0 0 3
4007 4007 OWID_ASI 0 Asia 2020-01-23 654 98 0 18 1 ... 0 0 0 0 0 0 0 0 0 3
4008 4008 OWID_ASI 0 Asia 2020-01-24 937 283 0 26 8 ... 0 0 0 0 0 0 0 0 0 3
4009 4009 OWID_ASI 0 Asia 2020-01-25 1,428 491 0 42 16 ... 0 0 0 0 0 0 0 0 0 3
4010 4010 OWID_ASI 0 Asia 2020-01-26 2,105 677 0 56 14 ... 0 0 0 0 0 0 0 0 0 3
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4466 4466 OWID_ASI 0 Asia 2021-04-26 37,496,027 440,992 468,236 499,552 4,483 ... 0 0 0 0 0 0 0 0 0 3
4467 4467 OWID_ASI 0 Asia 2021-04-27 37,986,034 490,007 474,154 504,552 5,000 ... 0 0 0 0 0 0 0 0 0 3
4468 4468 OWID_ASI 0 Asia 2021-04-28 38,492,711 506,677 479,168 509,870 5,318 ... 0 0 0 0 0 0 0 0 0 3
4469 4469 OWID_ASI 0 Asia 2021-04-29 39,007,933 515,222 483,807 515,111 5,241 ... 0 0 0 0 0 0 0 0 0 3
4470 4470 OWID_ASI 0 Asia 2021-04-30 39,526,308 518,375 489,099 520,286 5,175 ... 0 0 0 0 0 0 0 0 0 3

465 rows × 61 columns

In [36]:
dataframe_clusters_2[dataframe_clusters_2['cluster']==4] 
Out[36]:
index iso_code continent location date total_cases new_cases new_cases_smoothed total_deaths new_deaths ... extreme_poverty cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers handwashing_facilities hospital_beds_per_thousand life_expectancy human_development_index cluster
432 432 OWID_AFR 0 Africa 2020-02-13 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 4
433 433 OWID_AFR 0 Africa 2020-02-14 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 4
434 434 OWID_AFR 0 Africa 2020-02-15 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 4
435 435 OWID_AFR 0 Africa 2020-02-16 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 4
436 436 OWID_AFR 0 Africa 2020-02-17 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
35473 35473 IND Asia India 2021-04-26 17,636,186 323,023 330,745 197,894 2,771 ... 21 282 10 2 21 60 1 70 1 4
35474 35474 IND Asia India 2021-04-27 17,997,113 360,927 340,140 201,187 3,293 ... 21 282 10 2 21 60 1 70 1 4
35475 35475 IND Asia India 2021-04-28 18,376,421 379,308 349,378 204,832 3,645 ... 21 282 10 2 21 60 1 70 1 4
35476 35476 IND Asia India 2021-04-29 18,762,976 386,555 357,040 208,330 3,498 ... 21 282 10 2 21 60 1 70 1 4
35477 35477 IND Asia India 2021-04-30 19,164,969 401,993 364,927 211,853 3,523 ... 21 282 10 2 21 60 1 70 1 4

1829 rows × 61 columns

In [37]:
dataframe_clusters_2[dataframe_clusters_2['cluster']==1] 
Out[37]:
index iso_code continent location date total_cases new_cases new_cases_smoothed total_deaths new_deaths ... extreme_poverty cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers handwashing_facilities hospital_beds_per_thousand life_expectancy human_development_index cluster
0 0 AFG Asia Afghanistan 2020-02-24 1 1 0 0 0 ... 0 597 10 0 0 38 0 65 1 1
1 1 AFG Asia Afghanistan 2020-02-25 1 0 0 0 0 ... 0 597 10 0 0 38 0 65 1 1
2 2 AFG Asia Afghanistan 2020-02-26 1 0 0 0 0 ... 0 597 10 0 0 38 0 65 1 1
3 3 AFG Asia Afghanistan 2020-02-27 1 0 0 0 0 ... 0 597 10 0 0 38 0 65 1 1