Skip to content

Commit

Permalink
sistema replicavel administracaopublica okfn-brasil#1247
Browse files Browse the repository at this point in the history
nova spider ma_turilandia

correcao sufixo em seletor administracaopublica

Remoção de trechos e imports sobre tipagem

Corrige erros de digitação

Corrige lógica de geração de datas

Troca forma de reconhecimento de intervalos vazios

Resolve casos de redirecionamento

Corrige reconhecimento de layout do site

Corrige coleta de metadados

Reduz intervalo de recorrência de datas
  • Loading branch information
almeidadm authored and trevineju committed Jan 21, 2025
1 parent 888164d commit 78ac312
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 5 deletions.
58 changes: 58 additions & 0 deletions data_collection/gazette/spiders/base/administracaopublica.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import re
from datetime import datetime as dt

from dateutil.rrule import DAILY, rrule
from scrapy import Request

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class BaseAdministracaoPublicaSpider(BaseGazetteSpider):
"""
Base spider for cities using the https://administracaopublica.com.br/diario-oficial?token= plataform.
Gazettes are also available in http://www.transparenciadministrativa.com.br/diario/diariov2.xhtml?token=.
"""

allowed_domains = ["administracaopublica.com.br"]

def start_requests(self):
dates = list(
rrule(freq=DAILY, interval=10, dtstart=self.start_date, until=self.end_date)
)
dt_end_date = dt(self.end_date.year, self.end_date.month, self.end_date.day)
if dt_end_date not in dates:
dates.append(dt_end_date)

for i in range(len(dates) - 1):
start = dates[i].strftime("%Y-%m-%d")
end = dates[i + 1].strftime("%Y-%m-%d")
yield Request(
f"https://www.administracaopublica.com.br/diario-oficial?token={self.token}&de={start}&ate={end}"
)

def parse(self, response):
gazettes = response.css('[class*="diario_item_diario__"]')
for gazette in gazettes:
if "Nenhum resultado encontrado" not in response.text:
href = gazette.css(
'[class*="generics_button_baixar__"]::attr(href)'
).get()
pattern = gazette.css("::text").getall()

match pattern:
case [edition, _, power, date, _]:
pass
case [edition, _, date, _]:
power = ""
power_dict = {
"EXECUTIVO": "executive",
"LEGISLATIVO": "legislative",
}
yield Gazette(
edition_number=re.search(r"(\d+)/", edition).group(1),
date=dt.strptime(date, "%d/%m/%Y").date(),
file_urls=[f"https://www.administracaopublica.com.br{href}"],
is_extra_edition=power == "EXTRA",
power=power_dict.get(power, "executive_legislative"),
)
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/ma/ma_nova_iorque.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import datetime as dt

from gazette.spiders.base.administracaopublica import BaseAdministracaoPublicaSpider


class MaNovaIorqueSpider(BaseAdministracaoPublicaSpider):
    """Configuration of the ``ma_nova_iorque`` spider.

    All crawling logic lives in ``BaseAdministracaoPublicaSpider``; this
    class only supplies the values that spider reads.
    """

    name = "ma_nova_iorque"
    TERRITORY_ID = "2107308"
    # Value sent as the `token` query parameter by the base spider.
    token = "4f1cf16edf5d73feaad4fec2a03c7c9e1cf536aa"
    # First date requested when crawling.
    start_date = dt.date(year=2017, month=2, day=15)
9 changes: 4 additions & 5 deletions data_collection/gazette/spiders/ma/ma_peritoro.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import datetime as dt

from gazette.spiders.base.aplus import BaseAplusSpider
from gazette.spiders.base.administracaopublica import BaseAdministracaoPublicaSpider


class MaPeritoroSpider(BaseAdministracaoPublicaSpider):
    """Configuration of the ``ma_peritoro`` spider.

    All crawling logic lives in ``BaseAdministracaoPublicaSpider``; this
    class only supplies the values that spider reads.
    """

    TERRITORY_ID = "2108454"
    name = "ma_peritoro"
    # First date requested when crawling.
    start_date = dt.date(2017, 1, 2)
    # Value sent as the `token` query parameter by the base spider.
    # `allowed_domains` is intentionally not set here: the base spider
    # defines it for the domain its requests are sent to.
    token = "9de645b503b922df799865ffcb07a6ec7b9cb53e"
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/ma/ma_turilandia.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import datetime as dt

from gazette.spiders.base.administracaopublica import BaseAdministracaoPublicaSpider


class MaTurilandiaSpider(BaseAdministracaoPublicaSpider):
    """Configuration of the ``ma_turilandia`` spider.

    All crawling logic lives in ``BaseAdministracaoPublicaSpider``; this
    class only supplies the values that spider reads.
    """

    name = "ma_turilandia"
    TERRITORY_ID = "2112456"
    # Value sent as the `token` query parameter by the base spider.
    token = "9664abfc624b73571a05e874f98fd6d114834924"
    # First date requested when crawling.
    start_date = dt.date(year=2021, month=3, day=11)

0 comments on commit 78ac312

Please sign in to comment.