Format code with ruff #130

Open · wants to merge 2 commits into master
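The PR carries no description, but the diff below is the typical output of ruff's code formatter. A minimal sketch of the likely invocation (assumed; the exact command is not shown in the PR):

    pip install ruff        # the formatter this PR applies
    ruff format .           # rewrite all Python files in place
    ruff format --check .   # CI-friendly: report unformatted files, change nothing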
2 changes: 2 additions & 0 deletions .git-blame-ignore-revs
@@ -0,0 +1,2 @@
+# Initial code format with ruff
+12a8f0e39c6d9d232fe9e143e717dad1938987b6
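The new .git-blame-ignore-revs file lists the bulk-format commit so that `git blame` can skip it and keep pointing at the commits that last touched the logic. Git does not read this file automatically; a likely way to wire it up (standard git, 2.23 or later — the file path here is the repo's, the example target file is just for illustration):

    # opt in once per clone
    git config blame.ignoreRevsFile .git-blame-ignore-revs
    # or pass it explicitly for a single run
    git blame --ignore-revs-file .git-blame-ignore-revs jedeschule/pipelines/db_pipeline.py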
51 changes: 27 additions & 24 deletions jedeschule/old/jugendforscht.py
@@ -1,57 +1,60 @@
 import scrapy
 from scrapy.shell import inspect_response
 
+
 class SachsenSpider(scrapy.Spider):
     name = "jugendforscht"
     base_url = "http://jugend-forscht.bmbfcluster.de"
     list = "&V=list#mpl"
 
-    start_urls = ['http://jugend-forscht.bmbfcluster.de/index.php?M=445&PID=19']
+    start_urls = ["http://jugend-forscht.bmbfcluster.de/index.php?M=445&PID=19"]
 
     def parse(self, response):
-        #inspect_response(response, self)
+        # inspect_response(response, self)
         for li in response.css(".contextcontent li"):
-            link = li.css('a::attr(href)').extract_first()
-            request = scrapy.Request(self.base_url + link + self.list, callback=self.parse_state)
+            link = li.css("a::attr(href)").extract_first()
+            request = scrapy.Request(
+                self.base_url + link + self.list, callback=self.parse_state
+            )
             yield request
 
     def parse_state(self, response):
-        #inspect_response(response, self)
-        for li in response.css('.geo_list li'):
-            link = li.css('a::attr(href)').extract_first()
+        # inspect_response(response, self)
+        for li in response.css(".geo_list li"):
+            link = li.css("a::attr(href)").extract_first()
             request = scrapy.Request(self.base_url + link, callback=self.parse_locality)
             yield request
 
     def parse_locality(self, response):
-        #inspect_response(response, self)
-        for li in response.css('.geo_list li'):
-            link = li.css('a::attr(href)').extract_first()
+        # inspect_response(response, self)
+        for li in response.css(".geo_list li"):
+            link = li.css("a::attr(href)").extract_first()
             request = scrapy.Request(self.base_url + link, callback=self.parse_school)
             yield request
 
     def parse_school(self, response):
-        for li in response.css('.geo_list li'):
-            link = li.css('a::attr(href)').extract_first()
+        for li in response.css(".geo_list li"):
+            link = li.css("a::attr(href)").extract_first()
             request = scrapy.Request(self.base_url + link, callback=self.parse_item)
             yield request
 
     def parse_item(self, response):
-        #inspect_response(response, self)
+        # inspect_response(response, self)
         collection = {}
-        h4 = response.css('.even h4')
-        p = response.css('.even p')
+        h4 = response.css(".even h4")
+        p = response.css(".even p")
 
         response.h4 = h4
         response.p = p
 
-        #inspect_response(response, self)
+        # inspect_response(response, self)
 
-        if (len(h4) > 0):
-            collection['Schule'] = h4[0].css('::text').extract_first()
-        if (len(p) > 0):
-            collection['Ort'] = p[0].css('::text').extract_first()
-        if (len(h4) > 1):
-            collection['Wettbewerb'] = h4[1].css('::text').extract_first()
-        if (len(p) > 1):
-            collection['partner'] = p[1].css('::text').extract_first()
+        if len(h4) > 0:
+            collection["Schule"] = h4[0].css("::text").extract_first()
+        if len(p) > 0:
+            collection["Ort"] = p[0].css("::text").extract_first()
+        if len(h4) > 1:
+            collection["Wettbewerb"] = h4[1].css("::text").extract_first()
+        if len(p) > 1:
+            collection["partner"] = p[1].css("::text").extract_first()
         return collection
45 changes: 25 additions & 20 deletions jedeschule/old/klimaschutzschulenatlas.py
@@ -5,38 +5,43 @@
 
 class KlimaschutzSchulenAtlasSpider(scrapy.Spider):
     name = "klimaschutzschulenatlas"
-    start_urls = ['https://www.klimaschutzschulenatlas.de/der-atlas']
+    start_urls = ["https://www.klimaschutzschulenatlas.de/der-atlas"]
 
     def parse(self, response):
-        #inspect_response(response, self)
+        # inspect_response(response, self)
         yield scrapy.FormRequest.from_response(
-            response, callback=self.parse_projectlist)
+            response, callback=self.parse_projectlist
+        )
 
     def parse_projectlist(self, response):
-        #inspect_response(response, self)
+        # inspect_response(response, self)
         schoollinks = response.css(".media-body > a::attr(href)").extract()
         for link in schoollinks:
-            yield scrapy.Request('https://www.klimaschutzschulenatlas.de' + link,
-                                 callback=self.parse_school)
+            yield scrapy.Request(
+                "https://www.klimaschutzschulenatlas.de" + link,
+                callback=self.parse_school,
+            )
         if len(schoollinks) == 16:
-            next_page = response.css('.pagination a::attr(href)').extract()[-2]
-            yield scrapy.Request('https://www.klimaschutzschulenatlas.de' + next_page,
-                                 callback=self.parse_projectlist)
+            next_page = response.css(".pagination a::attr(href)").extract()[-2]
+            yield scrapy.Request(
+                "https://www.klimaschutzschulenatlas.de" + next_page,
+                callback=self.parse_projectlist,
+            )
 
     def parse_school(self, response):
-        #inspect_response(response, self)
+        # inspect_response(response, self)
         school = {}
-        school_information = response.css('.school-info li::text').extract()
-        school['type'] = school_information[0] if len(school_information) > 0 else ''
-        school['state'] = school_information[1] if len(school_information) > 1 else ''
-        school['street'] = school_information[2] if len(school_information) > 2 else ''
+        school_information = response.css(".school-info li::text").extract()
+        school["type"] = school_information[0] if len(school_information) > 0 else ""
+        school["state"] = school_information[1] if len(school_information) > 1 else ""
+        school["street"] = school_information[2] if len(school_information) > 2 else ""
         if len(school_information) > 4:
-            address_information = school_information[3].strip().split(' ')
-            school['plz'] = address_information[0]
-            school['place'] = address_information[1]
+            address_information = school_information[3].strip().split(" ")
+            school["plz"] = address_information[0]
+            school["place"] = address_information[1]
 
-        projects = response.css('.col-xs-6 a::attr(title)').extract()
+        projects = response.css(".col-xs-6 a::attr(title)").extract()
 
         for project in projects:
-            school['project'] = project
-            yield school
+            school["project"] = project
+            yield school
65 changes: 40 additions & 25 deletions jedeschule/old/schule-gegen-rassisum.py
@@ -5,39 +5,54 @@
 
 class SchuleGegenRassismusSpider(scrapy.Spider):
     name = "schule-gegen-rassismus"
-    start_urls = ['http://www.schule-ohne-rassismus.org/courage-schulen/alle-courage-schulen/']
+    start_urls = [
+        "http://www.schule-ohne-rassismus.org/courage-schulen/alle-courage-schulen/"
+    ]
 
     def parse(self, response):
         schoolcards = response.css(".news-list-item")
         for schoolcard in schoolcards:
             school = {}
-            link = schoolcard.css('#schoolcard_name a')
-            school['name'] = link.css('::text').extract_first().strip()
-            school['link'] = link.css('::attr(href)').extract_first().strip()
-            godfather = schoolcard.css('#schoolcard_godparent p::text').extract_first().split(':')
-            school['pate'] = godfather[1] if len(godfather) > 1 else godfather[0]
-            school['date'] = schoolcard.css('#schoolcard_title .news-list-date::text').extract_first().strip()
-            school['category'] = schoolcard.css('#schoolcard_legend::text').extract_first().strip()
-            yield scrapy.Request('http://www.schule-ohne-rassismus.org/' + school['link'],
-                                 meta= {'school': school},
-                                 callback=self.parse_detail)
-        if (len(schoolcards) == 20):
-            next = response.css("div.news-list-browse a:contains('chste')::attr(href)").extract_first()
-            request = scrapy.Request('http://www.schule-ohne-rassismus.org/' + next,
-                                     callback=self.parse)
+            link = schoolcard.css("#schoolcard_name a")
+            school["name"] = link.css("::text").extract_first().strip()
+            school["link"] = link.css("::attr(href)").extract_first().strip()
+            godfather = (
+                schoolcard.css("#schoolcard_godparent p::text")
+                .extract_first()
+                .split(":")
+            )
+            school["pate"] = godfather[1] if len(godfather) > 1 else godfather[0]
+            school["date"] = (
+                schoolcard.css("#schoolcard_title .news-list-date::text")
+                .extract_first()
+                .strip()
+            )
+            school["category"] = (
+                schoolcard.css("#schoolcard_legend::text").extract_first().strip()
+            )
+            yield scrapy.Request(
+                "http://www.schule-ohne-rassismus.org/" + school["link"],
+                meta={"school": school},
+                callback=self.parse_detail,
+            )
+        if len(schoolcards) == 20:
+            next = response.css(
+                "div.news-list-browse a:contains('chste')::attr(href)"
+            ).extract_first()
+            request = scrapy.Request(
+                "http://www.schule-ohne-rassismus.org/" + next, callback=self.parse
+            )
             yield request
 
     def parse_detail(self, response):
-        school = response.meta['school']
+        school = response.meta["school"]
 
-        address = response.css('.news-single-item p::text').extract()
-        #inspect_response(response, self)
-        school['street'] = address[0]
-        if (len(address) > 1):
-            address2 = address[1].split(' ')
-            school['postcode'] = address2[0]
+        address = response.css(".news-single-item p::text").extract()
+        # inspect_response(response, self)
+        school["street"] = address[0]
+        if len(address) > 1:
+            address2 = address[1].split(" ")
+            school["postcode"] = address2[0]
             address2.pop(0)
-            school['place'] = ' '.join(address2)
+            school["place"] = " ".join(address2)
         yield school
-
-
1 change: 0 additions & 1 deletion jedeschule/pipelines/__init__.py
@@ -1,4 +1,3 @@
 from .jsonpipeline import JsonPipeline
 from .school_pipeline import SchoolPipeline
 from .db_pipeline import DatabasePipeline
-
24 changes: 14 additions & 10 deletions jedeschule/pipelines/db_pipeline.py
@@ -1,4 +1,6 @@
-from __future__ import annotations  # needed so that update_or_create can define School return type
+from __future__ import (
+    annotations,
+)  # needed so that update_or_create can define School return type
 
 import logging
 import os
@@ -25,7 +27,7 @@ def get_session():
 
 
 class School(Base):
-    __tablename__ = 'schools'
+    __tablename__ = "schools"
     id = Column(String, primary_key=True)
     name = Column(String)
     address = Column(String)
@@ -41,28 +43,30 @@ class School(Base):
     phone = Column(String)
     director = Column(String)
     raw = Column(JSON)
-    location = Column(Geometry('POINT'))
+    location = Column(Geometry("POINT"))
 
     @staticmethod
     def update_or_create(item: SchoolPipelineItem, session=None) -> School:
         if not session:
             session = get_session()
 
         school_data = {**item.info}
-        school = session.query(School).get(item.info['id'])
-        latitude = school_data.pop('latitude', None)
-        longitude = school_data.pop('longitude', None)
+        school = session.query(School).get(item.info["id"])
+        latitude = school_data.pop("latitude", None)
+        longitude = school_data.pop("longitude", None)
         if latitude is not None and longitude is not None:
             location = WKTElement(f"POINT({longitude} {latitude})", srid=4326)
-            school_data['location'] = location
+            school_data["location"] = location
         if school:
-            session.query(School).filter_by(id=item.info['id']).update({**school_data, 'raw': item.item})
+            session.query(School).filter_by(id=item.info["id"]).update(
+                {**school_data, "raw": item.item}
+            )
        else:
             school = School(**school_data, raw=item.item)
         return school
 
     def __str__(self):
-        return f'<School id={self.id}, name={self.name}>'
+        return f"<School id={self.id}, name={self.name}>"
 
 
 class DatabasePipeline:
@@ -75,7 +79,7 @@ def process_item(self, item: SchoolPipelineItem, spider):
             self.session.add(school)
             self.session.commit()
         except SQLAlchemyError as e:
-            logging.warning('Error when putting to DB')
+            logging.warning("Error when putting to DB")
             logging.warning(e)
             self.session.rollback()
         return school
9 changes: 6 additions & 3 deletions jedeschule/pipelines/jsonpipeline.py
@@ -1,12 +1,15 @@
 from scrapy.exporters import JsonItemExporter
 import os
 
+
 class JsonPipeline(object):
     def open_spider(self, spider):
         if not os.path.exists("data"):
             os.makedirs("data")
-        self.file = open("data/" + spider.name + ".json", 'wb')
-        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
+        self.file = open("data/" + spider.name + ".json", "wb")
+        self.exporter = JsonItemExporter(
+            self.file, encoding="utf-8", ensure_ascii=False
+        )
         self.exporter.start_exporting()
 
@@ -15,4 +18,4 @@ def close_spider(self, spider):
 
     def process_item(self, item, spider):
         self.exporter.export_item(item)
-        return item
\ No newline at end of file
+        return item