Replicable administracaopublica system #1247 (#1297)
trevineju authored Jan 22, 2025
2 parents 888164d + ab08f52 commit cf5a2b4
Showing 4 changed files with 82 additions and 5 deletions.
58 changes: 58 additions & 0 deletions data_collection/gazette/spiders/base/administracaopublica.py
@@ -0,0 +1,58 @@
import re
from datetime import datetime as dt

from dateutil.rrule import DAILY, rrule
from scrapy import Request

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class BaseAdministracaoPublicaSpider(BaseGazetteSpider):
    """
    Base spider for cities using the https://administracaopublica.com.br/diario-oficial?token= platform.
    Gazettes are also available at http://www.transparenciadministrativa.com.br/diario/diariov2.xhtml?token=.
    """

    allowed_domains = ["administracaopublica.com.br"]

    def start_requests(self):
        # Build weekly boundaries over the crawl interval; each request covers one window.
        dates = list(
            rrule(freq=DAILY, interval=7, dtstart=self.start_date, until=self.end_date)
        )
        dt_end_date = dt(self.end_date.year, self.end_date.month, self.end_date.day)
        if dt_end_date not in dates:
            dates.append(dt_end_date)

        for i in range(len(dates) - 1):
            start = dates[i].strftime("%Y-%m-%d")
            end = dates[i + 1].strftime("%Y-%m-%d")
            yield Request(
                f"https://www.administracaopublica.com.br/diario-oficial?token={self.token}&de={start}&ate={end}"
            )

    def parse(self, response):
        gazettes = response.css('[class*="diario_item_diario__"]')
        for gazette in gazettes:
            # "Nenhum resultado encontrado" means "No results found".
            if "Nenhum resultado encontrado" not in response.text:
                href = gazette.css(
                    '[class*="generics_button_baixar__"]::attr(href)'
                ).get()
                pattern = gazette.css("::text").getall()

                # The third text field, when present, carries the power/edition type
                # (e.g. EXECUTIVO, LEGISLATIVO or EXTRA); shorter items omit it.
                match pattern:
                    case [edition, _, power, date, _]:
                        pass
                    case [edition, _, date, _]:
                        power = ""
                power_dict = {
                    "EXECUTIVO": "executive",
                    "LEGISLATIVO": "legislative",
                }
                yield Gazette(
                    edition_number=re.search(r"(\d+)[-/]", edition).group(1),
                    date=dt.strptime(date, "%d/%m/%Y").date(),
                    file_urls=[f"https://www.administracaopublica.com.br{href}"],
                    is_extra_edition=power == "EXTRA",
                    power=power_dict.get(power, "executive_legislative"),
                )
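
For context on the weekly windowing in start_requests above, here is a minimal standalone sketch (not part of this commit): the weekly_windows helper name and the sample dates are illustrative, but the arithmetic mirrors the spider's, including the appended end-date boundary.

from datetime import date, datetime as dt

from dateutil.rrule import DAILY, rrule


def weekly_windows(start_date, end_date):
    # Same arithmetic as start_requests, returning (de, ate) query pairs instead of Requests.
    dates = list(rrule(freq=DAILY, interval=7, dtstart=start_date, until=end_date))
    dt_end_date = dt(end_date.year, end_date.month, end_date.day)
    if dt_end_date not in dates:
        dates.append(dt_end_date)
    return [
        (dates[i].strftime("%Y-%m-%d"), dates[i + 1].strftime("%Y-%m-%d"))
        for i in range(len(dates) - 1)
    ]


print(weekly_windows(date(2025, 1, 1), date(2025, 1, 20)))
# [('2025-01-01', '2025-01-08'), ('2025-01-08', '2025-01-15'), ('2025-01-15', '2025-01-20')]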
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/ma/ma_nova_iorque.py
@@ -0,0 +1,10 @@
import datetime as dt

from gazette.spiders.base.administracaopublica import BaseAdministracaoPublicaSpider


class MaNovaIorqueSpider(BaseAdministracaoPublicaSpider):
    TERRITORY_ID = "2107308"
    name = "ma_nova_iorque"
    start_date = dt.date(2017, 2, 15)
    token = "4f1cf16edf5d73feaad4fec2a03c7c9e1cf536aa"
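
With the base spider in place, each city spider in this commit is configuration only: TERRITORY_ID, name, start_date and the platform token. Running one locally should follow the usual Scrapy workflow, e.g. scrapy crawl ma_nova_iorque from the Scrapy project root (presumably data_collection/ in this repository); narrowing the crawl window with -a start_date=YYYY-MM-DD -a end_date=YYYY-MM-DD assumes BaseGazetteSpider parses those arguments, which is outside this diff.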
9 changes: 4 additions & 5 deletions data_collection/gazette/spiders/ma/ma_peritoro.py
@@ -1,11 +1,10 @@
 import datetime as dt
 
-from gazette.spiders.base.aplus import BaseAplusSpider
+from gazette.spiders.base.administracaopublica import BaseAdministracaoPublicaSpider
 
 
-class MaPeritoroSpider(BaseAplusSpider):
+class MaPeritoroSpider(BaseAdministracaoPublicaSpider):
     TERRITORY_ID = "2108454"
     name = "ma_peritoro"
-    start_date = dt.date(2020, 1, 4)
-    allowed_domains = ["peritoro.ma.gov.br"]
-    url_base = "https://www.peritoro.ma.gov.br/diario/"
+    start_date = dt.date(2017, 1, 2)
+    token = "9de645b503b922df799865ffcb07a6ec7b9cb53e"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/ma/ma_turilandia.py
@@ -0,0 +1,10 @@
import datetime as dt

from gazette.spiders.base.administracaopublica import BaseAdministracaoPublicaSpider


class MaTurilandiaSpider(BaseAdministracaoPublicaSpider):
    TERRITORY_ID = "2112456"
    name = "ma_turilandia"
    start_date = dt.date(2021, 3, 11)
    token = "9664abfc624b73571a05e874f98fd6d114834924"
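
As a standalone illustration of the text-pattern handling in parse (not part of this commit), the snippet below runs the same match/case and field-normalization logic on two made-up text lists; the real values come from each item's CSS text nodes, so the sample strings are assumptions, not captured data. Note that match/case requires Python 3.10+.

import re
from datetime import datetime as dt

# Hypothetical text node lists; only the shape (4 vs. 5 fields) mirrors what parse() matches on.
samples = [
    ["123/2024", "Diário Oficial", "EXECUTIVO", "05/11/2024", "Baixar"],
    ["045/2021", "Diário Oficial", "11/03/2021", "Baixar"],
]

power_dict = {"EXECUTIVO": "executive", "LEGISLATIVO": "legislative"}

for pattern in samples:
    match pattern:
        case [edition, _, power, date, _]:
            pass
        case [edition, _, date, _]:
            power = ""
    print(
        re.search(r"(\d+)[-/]", edition).group(1),       # edition_number
        dt.strptime(date, "%d/%m/%Y").date(),            # date
        power == "EXTRA",                                # is_extra_edition
        power_dict.get(power, "executive_legislative"),  # power
    )
# 123 2024-11-05 False executive
# 045 2021-03-11 False executive_legislative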
