# Source code for pymacies_arg.extractor
# !/usr/bin/env python
# -*- coding: utf-8 -*-
# This file is part of the PymaciesArg Project
# https://github.com/juniors90/PymaciesArg.
#
# Copyright (c) 2022. Ferreira Juan David
# License: MIT
# Full Text: https://github.com/juniors90/PymaciesArg/blob/main/LICENSE
# =============================================================================
# DOCS
# =============================================================================
"""
PymaciesArg.
An extension that registers all pharmacies in Argentina.
"""
# =============================================================================
# IMPORTS
# =============================================================================
import logging
from datetime import datetime
from pathlib import Path
import requests
# Module-level logger named after this module. Records still propagate to
# whatever handlers the application installs on the root logger, but the
# library no longer logs *as* the root logger itself.
log = logging.getLogger(__name__)
class UrlExtractor:
    r"""Download a data set from a URL and store it as a dated csv file.

    Parameters
    ----------
    name : str
        The name of data (in this case, pharmacies) to extract. It is used
        as the category folder and as the prefix of the stored file name.
    url : str
        The url the data is downloaded from.

    Attributes
    ----------
    file_path_crib : str
        Template of the destination path, relative to the base directory
        passed to ``extract()``.

    Examples
    --------
    >>> import pathlib
    >>> from pymacies_arg import UrlExtractor, Transform, trasform_raws
    >>> name="pharmacies"
    >>> url="http://datos.salud.gob.ar/dataset\
    ... /39117f8f-e2bc-4571-a572-15a6ce7ea9e1\
    ... /resource/19338ea7-a492-4af3-b212-18f8f4af9184\
    ... /download/establecimientos-farmacias-enero-2021.csv"
    >>> url_extractor=UrlExtractor(name=name, url=url)
    >>> url_extractor.__repr__()
    '<Extractor for Name: pharmacies, URL: <long_url>>'
    >>> base_file_dir=pathlib.Path("/path/to/project")
    >>> path_to_csv = url_extractor.extract(
    ...     date_str="2022-03-28", base_file_dir=base_file_dir)
    >>> path_to_csv
    PosixPath('/path/to/project/data/pharmacies/2022-03/pharmacies-28-03-2022.csv')
    >>> import pandas as pd
    >>> df = pd.read_csv(path_to_csv)
    >>> data_transform = Transform()
    >>> data_transform.transform(df)
                       id  ...  web
    0      70260072329721  ...  NaN
    1      70100352324743  ...  NaN
    2      70064412318286  ...  NaN
    3      70340492347884  ...  NaN
    4      70140142334991  ...  NaN
    ...               ...  ...  ...
    13672  70460212355713  ...  NaN
    13673  70421472354613  ...  NaN
    13674  70940142195567  ...  NaN
    13675  70420702154608  ...  NaN
    13676  70064272320083  ...  NaN
    [13677 rows x 11 columns]
    >>> trasform_raws(date_str, file_paths, province, base_file_dir)
    """

    # Relative destination: data/<name>/<YYYY-MM>/<name>-<DD-MM-YYYY>.csv
    file_path_crib = "data/{category}/{year}-{month:02d}/{category}-{day:02d}-{month:02d}-{year}.csv"  # noqa: E501

    def __init__(self, name: str, url: str) -> None:
        self.name = name
        self.url = url

    def __repr__(self) -> str:
        """Return a representation of your object."""
        extractor = "<Extractor for Name: {name}, URL: {url}>"
        return extractor.format(name=self.name, url=self.url)

    def extract(
        self, date_str: str, base_file_dir: Path, timeout: float = 60.0
    ) -> Path:
        """Download the data and store it in a single csv file.

        Parameters
        ----------
        date_str : str
            The date of the run with format YYYY-mm-dd.
        base_file_dir : pathlib.Path
            The directory under which the ``data/...`` tree is created.
        timeout : float, optional
            Seconds to wait for the server to answer before giving up.
            Defaults to 60.

        Return
        ------
        file_path : pathlib.Path
            The destination location for your csv file.

        Raises
        ------
        ValueError
            If ``date_str`` does not have the format YYYY-mm-dd.
        requests.RequestException
            If the download fails, times out or the server answers with
            an HTTP error status.
        """
        log.info("Extracting %s", self.name)
        date = datetime.strptime(date_str, "%Y-%m-%d").date()
        file_path = self.file_path_crib.format(
            category=self.name, year=date.year, month=date.month, day=date.day
        )
        pharm_path = Path(base_file_dir) / file_path

        # Download before touching the disk so that a failed request does
        # not leave an empty directory tree or an error page stored as csv.
        r = requests.get(self.url, timeout=timeout)
        r.raise_for_status()
        r.encoding = "utf-8"

        pharm_path.parent.mkdir(parents=True, exist_ok=True)
        log.info("Storing file in %s", pharm_path)
        # Write with the same encoding used to decode the response, and
        # newline="" so the downloaded line endings are kept untouched.
        with open(pharm_path, "w", encoding="utf-8", newline="") as f:
            f.write(r.text)
        return pharm_path