Source code for pymacies_arg.core

# !/usr/bin/env python
# -*- coding: utf-8 -*-

# This file is part of the PymaciesArg Project
#     https://github.com/juniors90/PymaciesArg.
#
# Copyright (c) 2022. Ferreira Juan David
# License: MIT
#   Full Text: https://github.com/juniors90/PymaciesArg/blob/main/LICENSE

# =============================================================================
# DOCS
# =============================================================================

"""
PymaciesArg.

An extension that registers all pharmacies in Argentina.
"""

# =============================================================================
# IMPORTS
# =============================================================================

import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, List

import pandas as pd


from .constants import farmacias_ds
from .extractor import UrlExtractor
from .transform import Transform

# Module-level logger.
# NOTE(review): getLogger() without a name returns the root logger;
# logging.getLogger(__name__) is the usual per-module choice - confirm intent.
log = logging.getLogger()

# Registry of extractors keyed by dataset name. The module-level
# extract_raws() and trasform_raws() functions iterate over this mapping.
data_extractors = {
    "pharmacies": UrlExtractor(farmacias_ds["name"], farmacias_ds["url"]),
}


def extract_raws(date_str: str, base_file_dir: Path) -> Dict[str, Path]:
    """
    Run every registered extractor and collect the paths it returns.

    Each entry of ``data_extractors`` has its ``extract`` method called with
    the given date and base directory; the value it returns is stored under
    the entry's name.

    Parameters
    ----------
    date_str : str
        The run date with format YYYY-mm-dd.
    base_file_dir : Path
        A base file directory, forwarded unchanged to each extractor.

    Returns
    -------
    file_paths : dict[str, Path]
        The stored data file paths, keyed by dataset name.
    """
    return {
        dataset: source.extract(
            date_str=date_str, base_file_dir=base_file_dir
        )
        for dataset, source in data_extractors.items()
    }
def trasform_raws(
    date_str: str,
    file_paths: Dict[str, Path],
    province: str,
    base_file_dir: Path,
) -> List[Path]:
    """
    Filter the extracted data by province and store it as three CSV tables.

    For every dataset registered in ``data_extractors`` the raw CSV found at
    ``file_paths[name]`` is read, passed through ``Transform().transform``
    and restricted to the rows whose ``province`` column equals
    ``province``. Three tables are derived from those rows and written below
    ``base_file_dir``: pharmacies (indexed by ``id``), locations (indexed by
    ``id_location``) and departments (indexed by ``id_department``).

    Each table is written to::

        data/<table>/<YYYY>-<MM>/<province_slug>/<table>-<DD>-<MM>-<YYYY>.csv

    where ``<province_slug>`` is the lower-cased province name with spaces
    replaced by underscores. Missing parent directories are created.

    Parameters
    ----------
    date_str : str
        The run date with format YYYY-mm-dd; used to build the output
        directory and file names.
    file_paths : dict[str, Path]
        Raw data file paths keyed by dataset name.
    province : str
        The province name in UPPERCASE; matched exactly against the
        ``province`` column.
    base_file_dir : Path
        A base file directory under which the output tree is created.

    Returns
    -------
    data_paths : list[Path]
        The written files, in the order pharmacies, locations, departments.

    Raises
    ------
    ValueError
        If ``date_str`` does not match the ``%Y-%m-%d`` format.
    """
    # Only the dataset names are needed to look up the raw files.
    # NOTE(review): a single dataset is registered today; with several, the
    # frames built for the last one would be the ones written - confirm.
    for dataset in data_extractors:
        raw = pd.read_csv(file_paths[dataset])
        transformed = Transform().transform(raw)
        selected = transformed[transformed["province"] == province]

        df_fixed = selected[
            [
                "id",
                "name",
                "id_location",
                "id_department",
                "postal_code",
                "adress",
            ]
        ].set_index("id")

        # Grouping by the (id, label) pair and keeping only those two columns
        # yields one row per distinct pair.
        df_localidades = (
            selected.groupby(["id_location", "location"], as_index=False)
            .count()[["id_location", "location"]]
            .set_index("id_location")
        )
        df_departamentos = (
            selected.groupby(["id_department", "department"], as_index=False)
            .count()[["id_department", "department"]]
            .set_index("id_department")
        )

    run_date = datetime.strptime(date_str, "%Y-%m-%d").date()
    # Loop-invariant pieces of the output path, computed once.
    province_slug = province.lower().replace(" ", "_")
    month_dir = f"{run_date.year}-{run_date.month:02d}"
    date_suffix = f"{run_date.day:02d}-{run_date.month:02d}-{run_date.year}"

    tables = (
        ("pharmacies", df_fixed),
        ("locations", df_localidades),
        ("departments", df_departamentos),
    )

    data_paths = []
    for table, frame in tables:
        # Joined as one relative string so path parsing matches a plain
        # "data/..." literal.
        relative_path = (
            f"data/{table}/{month_dir}/{province_slug}"
            f"/{table}-{date_suffix}.csv"
        )
        target = base_file_dir / relative_path
        target.parent.mkdir(parents=True, exist_ok=True)
        frame.to_csv(target)
        data_paths.append(target)
    return data_paths
class PymaciesArg:
    """Pipeline entry point bound to a run date and a base directory.

    The instance stores the run date and the base directory once and
    forwards them to the module-level ``extract_raws`` and ``trasform_raws``
    functions.

    Initialize the extension in `pipeline.py`::

        import datetime
        import os
        import pathlib

        from pymacies_arg import (
            PymaciesArg,
            PharmaciesLoader,
            LocationsLoader,
            DepartmentsLoader,
        )
        from sqlalchemy import create_engine

        # this path is pointing to project/
        PATH = os.path.abspath(os.path.dirname(__file__))
        SQLALCHEMY_DATABASE_URI = "sqlite:///" + os.path.join(
            PATH, "db_data.db"
        )

        engine = create_engine(SQLALCHEMY_DATABASE_URI)

        date = datetime.datetime.now().strftime("%Y-%m-%d")

        pymacies = PymaciesArg(date, pathlib.Path(PATH))

        # Extract
        file_paths = pymacies.extract_raws()

        # Transform
        provinces = [
            "BUENOS AIRES",
            "SANTA FE",
            "CABA",
            "TUCUMÁN",
            "MISIONES",
            "CÓRDOBA",
            "ENTRE RÍOS",
            "CHACO",
            "SALTA",
            "CORRIENTES",
            "RÍO NEGRO",
            "LA PAMPA",
            "SANTIAGO DEL ESTERO",
            "SAN LUIS",
            "SAN JUAN",
            "NEUQUÉN",
            "CHUBUT",
            "JUJUY",
            "CATAMARCA",
            "FORMOSA",
            "LA RIOJA",
            "SANTA CRUZ",
            "TIERRA DEL FUEGO",
            "MENDOZA",
        ]

        paths = [
            pymacies.trasform_raws(file_paths, p) for p in provinces
        ]

        # Load
        for path in paths:
            PharmaciesLoader(engine).load_table(path[0])
            LocationsLoader(engine).load_table(path[1])
            DepartmentsLoader(engine).load_table(path[2])

    Attributes
    ----------
    date_str : str
        The run date with format YYYY-mm-dd.
    base_file_dir : Path
        A base file directory.
    """

    def __init__(self, date_str: str, base_file_dir: Path) -> None:
        self.date_str = date_str
        self.base_file_dir = base_file_dir

    def extract_raws(self) -> Dict[str, Path]:
        """
        Run the extraction step for this instance's date and directory.

        Delegates to the module-level ``extract_raws`` function.

        Returns
        -------
        file_paths : dict[str, Path]
            A dict of stored data file paths, keyed by dataset name.
        """
        # Inside a method body the bare name resolves to the module-level
        # function, not to this method.
        return extract_raws(self.date_str, self.base_file_dir)

    def trasform_raws(
        self, file_paths: Dict[str, Path], province: str
    ) -> List[Path]:
        """
        Run the transform step for one province.

        Delegates to the module-level ``trasform_raws`` function with this
        instance's date and base directory.

        Parameters
        ----------
        file_paths : dict[str, Path]
            Raw data file paths keyed by dataset name, as returned by
            :meth:`extract_raws`.
        province : str
            The province name in UPPERCASE.

        Returns
        -------
        data_paths : list[Path]
            The destination locations of the transformed data.
        """
        return trasform_raws(
            self.date_str, file_paths, province, self.base_file_dir
        )