Source code for pymacies_arg.core

# !/usr/bin/env python
# -*- coding: utf-8 -*-

# This file is part of the PymaciesArg Project
#     https://github.com/juniors90/PymaciesArg.
#
# Copyright (c) 2022. Ferreira Juan David
# License: MIT
#   Full Text: https://github.com/juniors90/PymaciesArg/blob/main/LICENSE

# =============================================================================
# DOCS
# =============================================================================

"""
PymaciesArg.

An extension that registers all pharmacies in Argentina.
"""

# =============================================================================
# IMPORTS
# =============================================================================

import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, List

import pandas as pd


from .constants import farmacias_ds
from .extractor import UrlExtractor
from .transform import Transform

# Module-level logger named after this module (instead of the root logger),
# so records emitted here can be filtered per module by the application.
# Records still propagate to whatever handlers the root logger has.
log = logging.getLogger(__name__)

# Registry of raw-data extractors keyed by dataset name. ``extract_raws``
# calls ``extract`` on every entry and reports the result under the same key.
data_extractors = {
    "pharmacies": UrlExtractor(farmacias_ds["name"], farmacias_ds["url"]),
}


def extract_raws(date_str: str, base_file_dir: Path) -> Dict[str, Path]:
    """
    Run every registered extractor and collect what each one returns.

    Each entry of ``data_extractors`` is asked to extract its dataset for
    the given run date below the given base directory.

    Parameters
    ----------
    date_str : str
        The run date (documented format: YYYY-mm-dd); forwarded unchanged
        to each extractor.
    base_file_dir : Path
        A base file directory; forwarded unchanged to each extractor.

    Returns
    -------
    file_paths : dict[str, Path]
        Maps each registry key to the value returned by that extractor's
        ``extract`` call (expected to be the stored data file path).
    """
    return {
        dataset: source.extract(
            date_str=date_str, base_file_dir=base_file_dir
        )
        for dataset, source in data_extractors.items()
    }


def _distinct_pairs(frame, id_column: str, name_column: str):
    """Return one row per distinct (id, name) pair, indexed by the id."""
    # Grouping on both columns collapses duplicates; the counts themselves
    # are discarded and only the two grouping columns are kept.
    return (
        frame.groupby([id_column, name_column], as_index=False)
        .count()[[id_column, name_column]]
        .set_index(id_column)
    )


def trasform_raws(
    date_str: str,
    file_paths: Dict[str, Path],
    province: str,
    base_file_dir: Path,
) -> List[Path]:
    """
    Transform the raw pharmacies file and store one province's tables.

    The raw CSV is read, passed through ``Transform``, and filtered down
    to the rows whose ``province`` column equals `province`. Three tables
    are derived from those rows and each one is written as a `.csv` file:
    the pharmacies themselves, the distinct locations and the distinct
    departments.

    Files are written to::

        <base_file_dir>/data/<table>/<YYYY>-<mm>/<province_slug>/
            <table>-<dd>-<mm>-<YYYY>.csv

    where ``<province_slug>`` is `province` lower-cased with spaces
    replaced by underscores. Missing directories are created.

    Parameters
    ----------
    date_str : str
        The date on run with format YYYY-mm-dd.
    file_paths : dict[str, Path]
        Raw data file paths keyed by dataset name, as returned by
        ``extract_raws``. The ``"pharmacies"`` entry is the one read.
    province : str
        The province name, compared for exact equality against the
        ``province`` column (documented as UPPERCASE).
    base_file_dir : Path
        A base file directory.

    Returns
    -------
    data_paths : list[Path]
        The written files, in the order pharmacies, locations,
        departments.

    Raises
    ------
    ValueError
        If `date_str` does not match the format YYYY-mm-dd.
    KeyError
        If `file_paths` has no ``"pharmacies"`` entry.
    """
    # Parse the date first so a malformed value fails before any file I/O.
    run_date = datetime.strptime(date_str, "%Y-%m-%d").date()

    # Read the pharmacies dataset explicitly rather than relying on a
    # variable leaking out of a loop over the extractor registry.
    raw = pd.read_csv(file_paths["pharmacies"])
    transformed = Transform().transform(raw)
    in_province = transformed[transformed["province"] == province]

    # NOTE(review): "adress" (sic) presumably matches the column name that
    # Transform produces -- confirm before correcting the spelling.
    pharmacies = in_province[
        [
            "id",
            "name",
            "id_location",
            "id_department",
            "postal_code",
            "adress",
        ]
    ].set_index("id")
    locations = _distinct_pairs(in_province, "id_location", "location")
    departments = _distinct_pairs(
        in_province, "id_department", "department"
    )

    province_slug = province.lower().replace(" ", "_")
    month_dir = f"{run_date.year}-{run_date.month:02d}"
    day_stamp = f"{run_date.day:02d}-{run_date.month:02d}-{run_date.year}"

    tables = (
        ("pharmacies", pharmacies),
        ("locations", locations),
        ("departments", departments),
    )
    data_paths = []
    for table_name, table in tables:
        target = (
            base_file_dir
            / "data"
            / table_name
            / month_dir
            / province_slug
            / f"{table_name}-{day_stamp}.csv"
        )
        target.parent.mkdir(parents=True, exist_ok=True)
        table.to_csv(target)
        data_paths.append(target)
    return data_paths


class PymaciesArg:
    """Facade over the extract and transform steps for a single run.

    An instance stores the run date and the base directory once and
    forwards them to the module-level ``extract_raws`` and
    ``trasform_raws`` functions.

    Initialize the extension in `pipeline.py`::

        import datetime
        import os
        import pathlib

        from pymacies_arg import (
            PymaciesArg,
            PharmaciesLoader,
            LocationsLoader,
            DepartmentsLoader,
        )

        from sqlalchemy import create_engine

        # this path is pointing to project/
        PATH = os.path.abspath(os.path.dirname(__file__))

        SQLALCHEMY_DATABASE_URI = "sqlite:///" + os.path.join(
            PATH, "db_data.db"
        )

        engine = create_engine(SQLALCHEMY_DATABASE_URI)

        date = datetime.datetime.now().strftime("%Y-%m-%d")

        pymacies = PymaciesArg(date, pathlib.Path(PATH))

        # Extract
        file_paths = pymacies.extract_raws()

        # Transform
        provinces = [
            "BUENOS AIRES",
            "SANTA FE",
            "CABA",
            "TUCUMÁN",
            "MISIONES",
            "CÓRDOBA",
            "ENTRE RÍOS",
            "CHACO",
            "SALTA",
            "CORRIENTES",
            "RÍO NEGRO",
            "LA PAMPA",
            "SANTIAGO DEL ESTERO",
            "SAN LUIS",
            "SAN JUAN",
            "NEUQUÉN",
            "CHUBUT",
            "JUJUY",
            "CATAMARCA",
            "FORMOSA",
            "LA RIOJA",
            "SANTA CRUZ",
            "TIERRA DEL FUEGO",
            "MENDOZA",
        ]
        paths = [
            pymacies.trasform_raws(file_paths, p) for p in provinces
        ]

        # Load
        for path in paths:
            PharmaciesLoader(engine).load_table(path[0])
            LocationsLoader(engine).load_table(path[1])
            DepartmentsLoader(engine).load_table(path[2])

    Attributes
    ----------
    date_str : str
        The date on run with format YYYY-mm-dd.
    base_file_dir : Path
        A base file directory.
    """

    def __init__(self, date_str: str, base_file_dir: Path) -> None:
        # Both values are stored as given; nothing is validated here.
        self.date_str = date_str
        self.base_file_dir = base_file_dir

    def extract_raws(self) -> Dict[str, Path]:
        """
        Run the extract step for this instance's date and base directory.

        Delegates to the module-level ``extract_raws`` function.

        Returns
        -------
        file_paths : dict[str, Path]
            A dict of stored data file paths.
        """
        return extract_raws(self.date_str, self.base_file_dir)

    def trasform_raws(
        self, file_paths: Dict[str, Path], province: str
    ) -> List[Path]:
        """
        Run the transform step for one province.

        Delegates to the module-level ``trasform_raws`` function, adding
        this instance's date and base directory.

        Parameters
        ----------
        file_paths : dict[str, Path]
            Raw data file paths keyed by dataset name, as returned by
            ``extract_raws``.
        province : str
            The province name in UPPERCASE.

        Returns
        -------
        data_paths : list[Path]
            The destination locations of the transformed data.
        """
        return trasform_raws(
            self.date_str, file_paths, province, self.base_file_dir
        )