1. Introduction

[1]:
import datetime
from pathlib import Path
import os
[2]:
import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns

1.1. Config

We define a base directory path

[3]:
BASE_FILE_DIR = Path("temp")

then we create a string that we can then edit to our liking with the .format() method

[4]:
file_path_crib = 'data/{category}/{year}-{month:02d}/{province}-{day:02d}-{month:02d}-{year}.csv'

We create a list of dictionaries containing dictionaries with keys name and url of each of the dataset to extract

[5]:
categories = [
    {
        "name": "pharmacies",
        "url": "http://datos.salud.gob.ar/dataset/39117f8f-e2bc-4571-a572-15a6ce7ea9e1/resource/19338ea7-a492-4af3-b212-18f8f4af9184/download/establecimientos-farmacias-enero-2021.csv"
    }
]

1.2. Extract

[6]:
farmacias = categories[0]
farmacias
[6]:
{'name': 'pharmacies',
 'url': 'http://datos.salud.gob.ar/dataset/39117f8f-e2bc-4571-a572-15a6ce7ea9e1/resource/19338ea7-a492-4af3-b212-18f8f4af9184/download/establecimientos-farmacias-enero-2021.csv'}
[7]:
r = requests.get(farmacias["url"])
r.encoding = 'utf-8'
r
[7]:
<Response [200]>

create a path string

[8]:
now = datetime.datetime.now()
now
# create a path string
file_path = file_path_crib.format(
    category=farmacias["name"],
    year=now.year,
    month=now.month,
    day=now.day,
    province="Argentina"
)
file_path
[8]:
'data/pharmacies/2022-10/Argentina-16-10-2022.csv'
[9]:
farmacias_path = BASE_FILE_DIR / file_path
farmacias_path
[9]:
PosixPath('temp/data/pharmacies/2022-10/Argentina-16-10-2022.csv')
[10]:
farmacias_path.parent.mkdir(parents=True, exist_ok=True)

extract all pharmacies

[11]:
with open(farmacias_path, "w") as f:
    f.write(r.text)

1.3. Transform

[12]:
df = pd.read_csv(farmacias_path)
df.head()
[12]:
establecimiento_id establecimiento_nombre localidad_id localidad_nombre provincia_id provincia_nombre departamento_id departamento_nombre cod_loc cod_ent origen_financiamiento tipologia_id tipologia_nombre tipologia cp domicilio sitio_web
0 70260072329721 PUJOL 26007020000 PUERTO MADRYN 26 CHUBUT 7 BIEDMA 20 0.0 Privado 70 FARMACIA Farmacia ambulatoria comercial 9120 LOS MAITENES 1538 NaN
1 70100352324743 SALUDFARMA 10035020000 BELEN 10 CATAMARCA 35 BELÉN 20 0.0 Privado 70 FARMACIA Farmacia ambulatoria comercial 4750 Mitre 14 NaN
2 70064412318286 MARSICO 6441030015 LA PLATA 6 BUENOS AIRES 441 LA PLATA 30 15.0 Privado 70 FARMACIA Farmacia ambulatoria comercial 1900 Avenida 7 445 NaN
3 70340492347884 PERALTA HNOS 34049010000 CLORINDA 34 FORMOSA 49 PILCOMAYO 10 0.0 Privado 70 FARMACIA Farmacia ambulatoria comercial 3610 Avenida San Martin 470 NaN
4 70140142334991 VALENTINO 14014010000 CORDOBA 14 CÓRDOBA 14 CAPITAL 10 0.0 Privado 70 FARMACIA Farmacia ambulatoria comercial 5000 Catamarca 1047 NaN
[13]:

df.isnull().sum()
[13]:
establecimiento_id            0
establecimiento_nombre        0
localidad_id                  0
localidad_nombre              0
provincia_id                  0
provincia_nombre              0
departamento_id               0
departamento_nombre           0
cod_loc                       0
cod_ent                     384
origen_financiamiento         0
tipologia_id                  0
tipologia_nombre              0
tipologia                     0
cp                            0
domicilio                     0
sitio_web                 13633
dtype: int64
[14]:
df.columns
[14]:
Index(['establecimiento_id', 'establecimiento_nombre', 'localidad_id',
       'localidad_nombre', 'provincia_id', 'provincia_nombre',
       'departamento_id', 'departamento_nombre', 'cod_loc', 'cod_ent ',
       'origen_financiamiento', 'tipologia_id', 'tipologia_nombre',
       'tipologia', 'cp', 'domicilio', 'sitio_web'],
      dtype='object')

Renombrar las columnas

[15]:
renamed_cols = {
    'establecimiento_id': 'id',
    'establecimiento_nombre': 'name',
    'domicilio':'adress',
    'localidad_id': 'id_location',
    'localidad_nombre': 'location',
    'provincia_id': 'id_province',
    'provincia_nombre': 'province',
    'departamento_id': 'id_department',
    'departamento_nombre': 'department',
    'cod_loc': 'cod_location',
    'tipologia_id': 'id_tipology',
    'tipologia_nombre': 'tipology',
    'cp': 'postal_code',
    'sitio_web': 'webpage',
}
[16]:
df = df.rename(columns = renamed_cols)
[17]:
df.columns
df.set_index("id")
[17]:
name id_location location id_province province id_department department cod_location cod_ent origen_financiamiento id_tipology tipology tipologia postal_code adress webpage
id
70260072329721 PUJOL 26007020000 PUERTO MADRYN 26 CHUBUT 7 BIEDMA 20 0.0 Privado 70 FARMACIA Farmacia ambulatoria comercial 9120 LOS MAITENES 1538 NaN
70100352324743 SALUDFARMA 10035020000 BELEN 10 CATAMARCA 35 BELÉN 20 0.0 Privado 70 FARMACIA Farmacia ambulatoria comercial 4750 Mitre 14 NaN
70064412318286 MARSICO 6441030015 LA PLATA 6 BUENOS AIRES 441 LA PLATA 30 15.0 Privado 70 FARMACIA Farmacia ambulatoria comercial 1900 Avenida 7 445 NaN
70340492347884 PERALTA HNOS 34049010000 CLORINDA 34 FORMOSA 49 PILCOMAYO 10 0.0 Privado 70 FARMACIA Farmacia ambulatoria comercial 3610 Avenida San Martin 470 NaN
70140142334991 VALENTINO 14014010000 CORDOBA 14 CÓRDOBA 14 CAPITAL 10 0.0 Privado 70 FARMACIA Farmacia ambulatoria comercial 5000 Catamarca 1047 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
70460212355713 FARMACOST 46021020000 ANILLACO 46 LA RIOJA 21 CASTRO BARROS 20 0.0 Privado 70 FARMACIA Farmacia ambulatoria comercial 5301 Coronel Nicolas Barros s/n Barrio Virgen del F... NaN
70421472354613 FARMACIA MUSSO 42147030000 TRENEL 42 LA PAMPA 147 TRENEL 30 0.0 Privado 70 FARMACIA Farmacia ambulatoria comercial 6369 9 DE JULIO 545 NaN
70940142195567 FARMACIA- HOSPITAL REGIONAL USHUAIA 94014020000 USHUAIA 94 TIERRA DEL FUEGO 14 USHUAIA 20 0.0 Provincial 70 FARMACIA Farmacia en Establecimiento de Salud 9410 12 de octubre 65 NaN
70420702154608 FARMACIA INSTITUCIONAL E.A. MANUEL FREIRE 42070030000 GUATRACHE 42 LA PAMPA 70 GUATRACHÉ 30 0.0 Provincial 70 FARMACIA Farmacia en Establecimiento de Salud 6309 Primera Junta Nº 158 NaN
70064272320083 CABRERA ESCOBAR 6427010008 RAFAEL CASTILLO 6 BUENOS AIRES 427 LA MATANZA 10 8.0 Privado 70 FARMACIA Farmacia ambulatoria comercial 1755 INT.FEDERICO P.RUSSO 1426 NaN

13677 rows × 16 columns

[18]:
cols = [
    'id',
    'name',
    'id_province',
    'province',
    'id_department',
    'department',
    'id_location',
    'location',
    'postal_code',
    'adress',
    'webpage',
]
[19]:
df = df[cols]
df
[19]:
id name id_province province id_department department id_location location postal_code adress webpage
0 70260072329721 PUJOL 26 CHUBUT 7 BIEDMA 26007020000 PUERTO MADRYN 9120 LOS MAITENES 1538 NaN
1 70100352324743 SALUDFARMA 10 CATAMARCA 35 BELÉN 10035020000 BELEN 4750 Mitre 14 NaN
2 70064412318286 MARSICO 6 BUENOS AIRES 441 LA PLATA 6441030015 LA PLATA 1900 Avenida 7 445 NaN
3 70340492347884 PERALTA HNOS 34 FORMOSA 49 PILCOMAYO 34049010000 CLORINDA 3610 Avenida San Martin 470 NaN
4 70140142334991 VALENTINO 14 CÓRDOBA 14 CAPITAL 14014010000 CORDOBA 5000 Catamarca 1047 NaN
... ... ... ... ... ... ... ... ... ... ... ...
13672 70460212355713 FARMACOST 46 LA RIOJA 21 CASTRO BARROS 46021020000 ANILLACO 5301 Coronel Nicolas Barros s/n Barrio Virgen del F... NaN
13673 70421472354613 FARMACIA MUSSO 42 LA PAMPA 147 TRENEL 42147030000 TRENEL 6369 9 DE JULIO 545 NaN
13674 70940142195567 FARMACIA- HOSPITAL REGIONAL USHUAIA 94 TIERRA DEL FUEGO 14 USHUAIA 94014020000 USHUAIA 9410 12 de octubre 65 NaN
13675 70420702154608 FARMACIA INSTITUCIONAL E.A. MANUEL FREIRE 42 LA PAMPA 70 GUATRACHÉ 42070030000 GUATRACHE 6309 Primera Junta Nº 158 NaN
13676 70064272320083 CABRERA ESCOBAR 6 BUENOS AIRES 427 LA MATANZA 6427010008 RAFAEL CASTILLO 1755 INT.FEDERICO P.RUSSO 1426 NaN

13677 rows × 11 columns

1.4. Load

[20]:
# create a path string
file_path = file_path_crib.format(
    category=farmacias["name"],
    year=now.year,
    month=now.month,
    day=now.day,
    province="ETL-Argentina"
)
farmacias_path = BASE_FILE_DIR / file_path
df.to_csv(farmacias_path)

1.5. Data Analisys Example

1.5.1. Total Pharmacies by Provinces

[21]:
provinces = df['province'].value_counts().keys()
total_farmacias = df['province'].value_counts().values
[21]:
Index(['BUENOS AIRES', 'SANTA FE', 'CABA', 'TUCUMÁN', 'MISIONES', 'CÓRDOBA',
       'ENTRE RÍOS', 'CHACO', 'SALTA', 'CORRIENTES', 'RÍO NEGRO', 'LA PAMPA',
       'SANTIAGO DEL ESTERO', 'SAN LUIS', 'SAN JUAN', 'NEUQUÉN', 'CHUBUT',
       'JUJUY', 'CATAMARCA', 'FORMOSA', 'LA RIOJA', 'SANTA CRUZ',
       'TIERRA DEL FUEGO', 'MENDOZA'],
      dtype='object')
[22]:
df_provincias = pd.DataFrame(
    {
        "Provinces": provinces,
        "Total_Pharms": total_farmacias,
        "Total_Pharms_in_percent": total_farmacias/ df.shape[0]
    }
)
df_provincias
[22]:
Provinces Total_Pharms Total_Pharms_in_percent
0 BUENOS AIRES 5053 0.369452
1 SANTA FE 1778 0.129999
2 CABA 1024 0.074870
3 TUCUMÁN 611 0.044674
4 MISIONES 594 0.043431
5 CÓRDOBA 566 0.041383
6 ENTRE RÍOS 489 0.035753
7 CHACO 426 0.031147
8 SALTA 337 0.024640
9 CORRIENTES 303 0.022154
10 RÍO NEGRO 273 0.019961
11 LA PAMPA 262 0.019156
12 SANTIAGO DEL ESTERO 251 0.018352
13 SAN LUIS 238 0.017401
14 SAN JUAN 228 0.016670
15 NEUQUÉN 192 0.014038
16 CHUBUT 185 0.013526
17 JUJUY 184 0.013453
18 CATAMARCA 180 0.013161
19 FORMOSA 162 0.011845
20 LA RIOJA 155 0.011333
21 SANTA CRUZ 93 0.006800
22 TIERRA DEL FUEGO 62 0.004533
23 MENDOZA 31 0.002267
[23]:
# barplot of all the features
fig = plt.figure(figsize = (12,4))
sns.barplot(x = 'Provinces', y = 'Total_Pharms', data = df_provincias)
plt.xticks(rotation=90)
plt.grid()
plt.title("Total Pharmacies by Province")
plt.show()
../_images/notes_data_exploratory_en_33_0.png
[24]:
fig = plt.figure(figsize = (12,4))
sns.barplot(x = 'Provinces', y = 'Total_Pharms_in_percent', data = df_provincias)
plt.xticks(rotation=90)
plt.grid()
plt.title("Total Pharms by Province in percent")
plt.show()
../_images/notes_data_exploratory_en_34_0.png

1.5.2. Filter Data by Province

[25]:
# Get pharms by province
def pharms_by_province(province):
    df_province =  df[df["province"]==province]
    return df_province
[26]:
# Get all pharms by department in some province
def pharms_by_department(df_province):
    department = df_province['department'].value_counts()
    df_pharms_by_department = pd.DataFrame(
        {
            "Departments": department.keys(),
            "Total_Pharms_by_Department": department.values,
            "Total_Pharms_in_percent": department.values / department.values.sum()
        }
    )
    return df_pharms_by_department
[27]:
data_allprovinces = {index : pharms_by_province(index) for index in provinces}
data_alldepartment = {index : pharms_by_department(data_allprovinces[index]) for index in provinces}

[28]:
province_name_by_index = "CÓRDOBA"
df_province = data_allprovinces[province_name_by_index]# data for province "CÓRDOBA"
df_province.head()
[28]:
id name id_province province id_department department id_location location postal_code adress webpage
4 70140142334991 VALENTINO 14 CÓRDOBA 14 CAPITAL 14014010000 CORDOBA 5000 Catamarca 1047 NaN
129 70140142333750 HP FARMA V 14 CÓRDOBA 14 CAPITAL 14014010000 CORDOBA 5000 Elías Yofre 1050 NaN
135 70140422333755 FARMAVIDA IX 14 CÓRDOBA 42 GENERAL SAN MARTÍN 14042170000 VILLA MARIA 5900 Sarmiento 1966 NaN
177 70140212333758 GRAMIGNA 14 CÓRDOBA 21 COLÓN 14021200000 MENDIOLAZA 5107 Avenida Martín Tissera 2997 NaN
178 70140142333759 GENERAL PAZ NUEVOCENTRO 14 CÓRDOBA 14 CAPITAL 14014010000 CORDOBA 5000 Avenida Duarte Quirós 1400 NaN
[29]:
df_department=data_alldepartment[province_name_by_index]
df_department
[29]:
Departments Total_Pharms_by_Department Total_Pharms_in_percent
0 CAPITAL 295 0.521201
1 SAN JUSTO 38 0.067138
2 COLÓN 31 0.054770
3 PUNILLA 30 0.053004
4 GENERAL SAN MARTÍN 28 0.049470
5 RÍO CUARTO 25 0.044170
6 RÍO SEGUNDO 20 0.035336
7 SANTA MARÍA 16 0.028269
8 MARCOS JUÁREZ 11 0.019435
9 TERCERO ARRIBA 9 0.015901
10 CRUZ DEL EJE 9 0.015901
11 CALAMUCHITA 8 0.014134
12 UNIÓN 8 0.014134
13 JUÁREZ CELMAN 7 0.012367
14 RÍO PRIMERO 5 0.008834
15 SAN JAVIER 5 0.008834
16 ISCHILÍN 5 0.008834
17 PRESIDENTE ROQUE SÁENZ PEÑA 5 0.008834
18 SAN ALBERTO 3 0.005300
19 SOBREMONTE 2 0.003534
20 RÍO SECO 2 0.003534
21 GENERAL ROCA 1 0.001767
22 MINAS 1 0.001767
23 TOTORAL 1 0.001767
24 TULUMBA 1 0.001767
[30]:
plt.rcParams["font.size"] = "5.0"
for province_name_by_index in provinces:
    df_department = data_alldepartment[province_name_by_index]
    fig = plt.figure(figsize = (12,4))
    sns.barplot(x = 'Departments', y = 'Total_Pharms_by_Department', data = df_department)
    plt.xticks(rotation = 90)
    plt.rcParams["font.size"] = "10.0"
    plt.grid()
    plt.title(f"Total Pharms by Department in {province_name_by_index}")
    plt.show()
../_images/notes_data_exploratory_en_41_0.png
../_images/notes_data_exploratory_en_41_1.png
../_images/notes_data_exploratory_en_41_2.png
../_images/notes_data_exploratory_en_41_3.png
../_images/notes_data_exploratory_en_41_4.png
../_images/notes_data_exploratory_en_41_5.png
../_images/notes_data_exploratory_en_41_6.png
../_images/notes_data_exploratory_en_41_7.png
../_images/notes_data_exploratory_en_41_8.png
../_images/notes_data_exploratory_en_41_9.png
../_images/notes_data_exploratory_en_41_10.png
../_images/notes_data_exploratory_en_41_11.png
../_images/notes_data_exploratory_en_41_12.png
../_images/notes_data_exploratory_en_41_13.png
../_images/notes_data_exploratory_en_41_14.png
../_images/notes_data_exploratory_en_41_15.png
../_images/notes_data_exploratory_en_41_16.png
../_images/notes_data_exploratory_en_41_17.png
../_images/notes_data_exploratory_en_41_18.png
../_images/notes_data_exploratory_en_41_19.png
../_images/notes_data_exploratory_en_41_20.png
../_images/notes_data_exploratory_en_41_21.png
../_images/notes_data_exploratory_en_41_22.png
../_images/notes_data_exploratory_en_41_23.png
[31]:
# barplot of all the features
plt.rcParams["font.size"] = "5.0"
for province_name_by_index in provinces:
    df_department = data_alldepartment[province_name_by_index]
    fig = plt.figure(figsize = (12,4))
    sns.barplot(x = 'Departments', y = 'Total_Pharms_in_percent', data = df_department)
    plt.xticks(rotation = 90)
    plt.grid()
    plt.rcParams["font.size"] = "10.0"
    plt.title(f"Total Pharms by Department in {province_name_by_index}")
    plt.show()

../_images/notes_data_exploratory_en_42_0.png
../_images/notes_data_exploratory_en_42_1.png
../_images/notes_data_exploratory_en_42_2.png
../_images/notes_data_exploratory_en_42_3.png
../_images/notes_data_exploratory_en_42_4.png
../_images/notes_data_exploratory_en_42_5.png
../_images/notes_data_exploratory_en_42_6.png
../_images/notes_data_exploratory_en_42_7.png
../_images/notes_data_exploratory_en_42_8.png
../_images/notes_data_exploratory_en_42_9.png
../_images/notes_data_exploratory_en_42_10.png
../_images/notes_data_exploratory_en_42_11.png
../_images/notes_data_exploratory_en_42_12.png
../_images/notes_data_exploratory_en_42_13.png
../_images/notes_data_exploratory_en_42_14.png
../_images/notes_data_exploratory_en_42_15.png
../_images/notes_data_exploratory_en_42_16.png
../_images/notes_data_exploratory_en_42_17.png
../_images/notes_data_exploratory_en_42_18.png
../_images/notes_data_exploratory_en_42_19.png
../_images/notes_data_exploratory_en_42_20.png
../_images/notes_data_exploratory_en_42_21.png
../_images/notes_data_exploratory_en_42_22.png
../_images/notes_data_exploratory_en_42_23.png

Derive Additional Features

[32]:
for province_name_by_index in provinces:
    df_province = data_allprovinces[province_name_by_index]# data for province "CÓRDOBA"
    file_path = file_path_crib.format(
        category = farmacias["name"],
        year = now.year,
        month = now.month,
        day = now.day,
        province = f"pharmacies_{province_name_by_index.lower().replace(' ', '_')}",
    )
    province_path = BASE_FILE_DIR / file_path
    # save data frame in CSV file
    df_province.to_csv(province_path)