diff --git a/data_collection/gazette/spiders/base/administracaopublica.py b/data_collection/gazette/spiders/base/administracaopublica.py
new file mode 100644
index 000000000..02f772c80
--- /dev/null
+++ b/data_collection/gazette/spiders/base/administracaopublica.py
@@ -0,0 +1,58 @@
+import re
+from datetime import datetime as dt
+
+from dateutil.rrule import DAILY, rrule
+from scrapy import Request
+
+from gazette.items import Gazette
+from gazette.spiders.base import BaseGazetteSpider
+
+
+class BaseAdministracaoPublicaSpider(BaseGazetteSpider):
+    """
+    Base spider for cities using the https://administracaopublica.com.br/diario-oficial?token= platform.
+    Gazettes are also available in http://www.transparenciadministrativa.com.br/diario/diariov2.xhtml?token=.
+    """
+
+    allowed_domains = ["administracaopublica.com.br"]
+
+    def start_requests(self):
+        dates = list(
+            rrule(freq=DAILY, interval=7, dtstart=self.start_date, until=self.end_date)
+        )
+        dt_end_date = dt(self.end_date.year, self.end_date.month, self.end_date.day)
+        if dt_end_date not in dates:
+            dates.append(dt_end_date)
+
+        for i in range(len(dates) - 1):
+            start = dates[i].strftime("%Y-%m-%d")
+            end = dates[i + 1].strftime("%Y-%m-%d")
+            yield Request(
+                f"https://www.administracaopublica.com.br/diario-oficial?token={self.token}&de={start}&ate={end}"
+            )
+
+    def parse(self, response):
+        gazettes = response.css('[class*="diario_item_diario__"]')
+        for gazette in gazettes:
+            if "Nenhum resultado encontrado" not in response.text:
+                href = gazette.css(
+                    '[class*="generics_button_baixar__"]::attr(href)'
+                ).get()
+                pattern = gazette.css("::text").getall()
+
+                match pattern:
+                    case [edition, _, power, date, _]:
+                        pass
+                    case [edition, _, date, _]:
+                        power = ""
+                power_dict = {
+                    "EXECUTIVO": "executive",
+                    "LEGISLATIVO": "legislative",
+                }
+                yield Gazette(
+                    edition_number=re.search(r"(\d+)[-/]", edition).group(1),
+                    date=dt.strptime(date, "%d/%m/%Y").date(),
+                    file_urls=[f"https://www.administracaopublica.com.br{href}"],
+                    is_extra_edition=power == "EXTRA",
+                    power=power_dict.get(power, "executive_legislative"),
+                )
diff --git a/data_collection/gazette/spiders/ma/ma_nova_iorque.py b/data_collection/gazette/spiders/ma/ma_nova_iorque.py
new file mode 100644
index 000000000..80b0ed0ff
--- /dev/null
+++ b/data_collection/gazette/spiders/ma/ma_nova_iorque.py
@@ -0,0 +1,10 @@
+import datetime as dt
+
+from gazette.spiders.base.administracaopublica import BaseAdministracaoPublicaSpider
+
+
+class MaNovaIorqueSpider(BaseAdministracaoPublicaSpider):
+    TERRITORY_ID = "2107308"
+    name = "ma_nova_iorque"
+    start_date = dt.date(2017, 2, 15)
+    token = "4f1cf16edf5d73feaad4fec2a03c7c9e1cf536aa"
diff --git a/data_collection/gazette/spiders/ma/ma_peritoro.py b/data_collection/gazette/spiders/ma/ma_peritoro.py
index e6e12f02d..9d9b8beb1 100644
--- a/data_collection/gazette/spiders/ma/ma_peritoro.py
+++ b/data_collection/gazette/spiders/ma/ma_peritoro.py
@@ -1,11 +1,10 @@
 import datetime as dt
 
-from gazette.spiders.base.aplus import BaseAplusSpider
+from gazette.spiders.base.administracaopublica import BaseAdministracaoPublicaSpider
 
 
-class MaPeritoroSpider(BaseAplusSpider):
+class MaPeritoroSpider(BaseAdministracaoPublicaSpider):
     TERRITORY_ID = "2108454"
     name = "ma_peritoro"
-    start_date = dt.date(2020, 1, 4)
-    allowed_domains = ["peritoro.ma.gov.br"]
-    url_base = "https://www.peritoro.ma.gov.br/diario/"
+    start_date = dt.date(2017, 1, 2)
+    token = "9de645b503b922df799865ffcb07a6ec7b9cb53e"
diff --git a/data_collection/gazette/spiders/ma/ma_turilandia.py b/data_collection/gazette/spiders/ma/ma_turilandia.py
new file mode 100644
index 000000000..2f229b65b
--- /dev/null
+++ b/data_collection/gazette/spiders/ma/ma_turilandia.py
@@ -0,0 +1,10 @@
+import datetime as dt
+
+from gazette.spiders.base.administracaopublica import BaseAdministracaoPublicaSpider
+
+
+class MaTurilandiaSpider(BaseAdministracaoPublicaSpider):
+    TERRITORY_ID = "2112456"
+    name = "ma_turilandia"
+    start_date = dt.date(2021, 3, 11)
+    token = "9664abfc624b73571a05e874f98fd6d114834924"