Format code with ruff #130

Open · wants to merge 2 commits into master
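The PR carries no description, but the diff below is the typical output of ruff's code formatter. A minimal sketch of the likely invocation (assumed; the exact command is not shown in the PR):

    pip install ruff        # the formatter this PR applies
    ruff format .           # rewrite all Python files in place
    ruff format --check .   # CI-friendly: report unformatted files, change nothing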
2 changes: 2 additions & 0 deletions .git-blame-ignore-revs
@@ -0,0 +1,2 @@
+# Initial code format with ruff
+12a8f0e39c6d9d232fe9e143e717dad1938987b6
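The new .git-blame-ignore-revs file lists the bulk-format commit so that `git blame` can skip it and keep pointing at the commits that last touched the logic. Git does not read this file automatically; a likely way to wire it up (standard git, 2.23 or later — the file path here is the repo's, the example target file is just for illustration):

    # opt in once per clone
    git config blame.ignoreRevsFile .git-blame-ignore-revs
    # or pass it explicitly for a single run
    git blame --ignore-revs-file .git-blame-ignore-revs jedeschule/pipelines/db_pipeline.py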
51 changes: 27 additions & 24 deletions jedeschule/old/jugendforscht.py
@@ -1,57 +1,60 @@
 import scrapy
 from scrapy.shell import inspect_response
 
+
 class SachsenSpider(scrapy.Spider):
     name = "jugendforscht"
     base_url = "http://jugend-forscht.bmbfcluster.de"
     list = "&V=list#mpl"
 
-    start_urls = ['http://jugend-forscht.bmbfcluster.de/index.php?M=445&PID=19']
+    start_urls = ["http://jugend-forscht.bmbfcluster.de/index.php?M=445&PID=19"]
 
     def parse(self, response):
-        #inspect_response(response, self)
+        # inspect_response(response, self)
         for li in response.css(".contextcontent li"):
-            link = li.css('a::attr(href)').extract_first()
-            request = scrapy.Request(self.base_url + link + self.list, callback=self.parse_state)
+            link = li.css("a::attr(href)").extract_first()
+            request = scrapy.Request(
+                self.base_url + link + self.list, callback=self.parse_state
+            )
             yield request
 
     def parse_state(self, response):
-        #inspect_response(response, self)
-        for li in response.css('.geo_list li'):
-            link = li.css('a::attr(href)').extract_first()
+        # inspect_response(response, self)
+        for li in response.css(".geo_list li"):
+            link = li.css("a::attr(href)").extract_first()
             request = scrapy.Request(self.base_url + link, callback=self.parse_locality)
             yield request
 
     def parse_locality(self, response):
-        #inspect_response(response, self)
-        for li in response.css('.geo_list li'):
-            link = li.css('a::attr(href)').extract_first()
+        # inspect_response(response, self)
+        for li in response.css(".geo_list li"):
+            link = li.css("a::attr(href)").extract_first()
             request = scrapy.Request(self.base_url + link, callback=self.parse_school)
             yield request
 
     def parse_school(self, response):
-        for li in response.css('.geo_list li'):
-            link = li.css('a::attr(href)').extract_first()
+        for li in response.css(".geo_list li"):
+            link = li.css("a::attr(href)").extract_first()
             request = scrapy.Request(self.base_url + link, callback=self.parse_item)
             yield request
 
     def parse_item(self, response):
-        #inspect_response(response, self)
+        # inspect_response(response, self)
         collection = {}
-        h4 = response.css('.even h4')
-        p = response.css('.even p')
+        h4 = response.css(".even h4")
+        p = response.css(".even p")
 
         response.h4 = h4
         response.p = p
 
-        #inspect_response(response, self)
+        # inspect_response(response, self)
 
-        if (len(h4) > 0):
-            collection['Schule'] = h4[0].css('::text').extract_first()
-        if (len(p) > 0):
-            collection['Ort'] = p[0].css('::text').extract_first()
-        if (len(h4) > 1):
-            collection['Wettbewerb'] = h4[1].css('::text').extract_first()
-        if (len(p) > 1):
-            collection['partner'] = p[1].css('::text').extract_first()
+        if len(h4) > 0:
+            collection["Schule"] = h4[0].css("::text").extract_first()
+        if len(p) > 0:
+            collection["Ort"] = p[0].css("::text").extract_first()
+        if len(h4) > 1:
+            collection["Wettbewerb"] = h4[1].css("::text").extract_first()
+        if len(p) > 1:
+            collection["partner"] = p[1].css("::text").extract_first()
         return collection
45 changes: 25 additions & 20 deletions jedeschule/old/klimaschutzschulenatlas.py
@@ -5,38 +5,43 @@
 
 class KlimaschutzSchulenAtlasSpider(scrapy.Spider):
     name = "klimaschutzschulenatlas"
-    start_urls = ['https://www.klimaschutzschulenatlas.de/der-atlas']
+    start_urls = ["https://www.klimaschutzschulenatlas.de/der-atlas"]
 
     def parse(self, response):
-        #inspect_response(response, self)
+        # inspect_response(response, self)
         yield scrapy.FormRequest.from_response(
-            response, callback=self.parse_projectlist)
+            response, callback=self.parse_projectlist
+        )
 
     def parse_projectlist(self, response):
-        #inspect_response(response, self)
+        # inspect_response(response, self)
         schoollinks = response.css(".media-body > a::attr(href)").extract()
         for link in schoollinks:
-            yield scrapy.Request('https://www.klimaschutzschulenatlas.de' + link,
-                                 callback=self.parse_school)
+            yield scrapy.Request(
+                "https://www.klimaschutzschulenatlas.de" + link,
+                callback=self.parse_school,
+            )
         if len(schoollinks) == 16:
-            next_page = response.css('.pagination a::attr(href)').extract()[-2]
-            yield scrapy.Request('https://www.klimaschutzschulenatlas.de' + next_page,
-                                 callback=self.parse_projectlist)
+            next_page = response.css(".pagination a::attr(href)").extract()[-2]
+            yield scrapy.Request(
+                "https://www.klimaschutzschulenatlas.de" + next_page,
+                callback=self.parse_projectlist,
+            )
 
     def parse_school(self, response):
-        #inspect_response(response, self)
+        # inspect_response(response, self)
         school = {}
-        school_information = response.css('.school-info li::text').extract()
-        school['type'] = school_information[0] if len(school_information) > 0 else ''
-        school['state'] = school_information[1] if len(school_information) > 1 else ''
-        school['street'] = school_information[2] if len(school_information) > 2 else ''
+        school_information = response.css(".school-info li::text").extract()
+        school["type"] = school_information[0] if len(school_information) > 0 else ""
+        school["state"] = school_information[1] if len(school_information) > 1 else ""
+        school["street"] = school_information[2] if len(school_information) > 2 else ""
         if len(school_information) > 4:
-            address_information = school_information[3].strip().split(' ')
-            school['plz'] = address_information[0]
-            school['place'] = address_information[1]
+            address_information = school_information[3].strip().split(" ")
+            school["plz"] = address_information[0]
+            school["place"] = address_information[1]
 
-        projects = response.css('.col-xs-6 a::attr(title)').extract()
+        projects = response.css(".col-xs-6 a::attr(title)").extract()
 
         for project in projects:
-            school['project'] = project
-            yield school
+            school["project"] = project
+            yield school
65 changes: 40 additions & 25 deletions jedeschule/old/schule-gegen-rassisum.py
@@ -5,39 +5,54 @@
 
 class SchuleGegenRassismusSpider(scrapy.Spider):
     name = "schule-gegen-rassismus"
-    start_urls = ['http://www.schule-ohne-rassismus.org/courage-schulen/alle-courage-schulen/']
+    start_urls = [
+        "http://www.schule-ohne-rassismus.org/courage-schulen/alle-courage-schulen/"
+    ]
 
     def parse(self, response):
         schoolcards = response.css(".news-list-item")
         for schoolcard in schoolcards:
             school = {}
-            link = schoolcard.css('#schoolcard_name a')
-            school['name'] = link.css('::text').extract_first().strip()
-            school['link'] = link.css('::attr(href)').extract_first().strip()
-            godfather = schoolcard.css('#schoolcard_godparent p::text').extract_first().split(':')
-            school['pate'] = godfather[1] if len(godfather) > 1 else godfather[0]
-            school['date'] = schoolcard.css('#schoolcard_title .news-list-date::text').extract_first().strip()
-            school['category'] = schoolcard.css('#schoolcard_legend::text').extract_first().strip()
-            yield scrapy.Request('http://www.schule-ohne-rassismus.org/' + school['link'],
-                                 meta= {'school': school},
-                                 callback=self.parse_detail)
-        if (len(schoolcards) == 20):
-            next = response.css("div.news-list-browse a:contains('chste')::attr(href)").extract_first()
-            request = scrapy.Request('http://www.schule-ohne-rassismus.org/' + next,
-                                     callback=self.parse)
+            link = schoolcard.css("#schoolcard_name a")
+            school["name"] = link.css("::text").extract_first().strip()
+            school["link"] = link.css("::attr(href)").extract_first().strip()
+            godfather = (
+                schoolcard.css("#schoolcard_godparent p::text")
+                .extract_first()
+                .split(":")
+            )
+            school["pate"] = godfather[1] if len(godfather) > 1 else godfather[0]
+            school["date"] = (
+                schoolcard.css("#schoolcard_title .news-list-date::text")
+                .extract_first()
+                .strip()
+            )
+            school["category"] = (
+                schoolcard.css("#schoolcard_legend::text").extract_first().strip()
+            )
+            yield scrapy.Request(
+                "http://www.schule-ohne-rassismus.org/" + school["link"],
+                meta={"school": school},
+                callback=self.parse_detail,
+            )
+        if len(schoolcards) == 20:
+            next = response.css(
+                "div.news-list-browse a:contains('chste')::attr(href)"
+            ).extract_first()
+            request = scrapy.Request(
+                "http://www.schule-ohne-rassismus.org/" + next, callback=self.parse
+            )
             yield request
 
     def parse_detail(self, response):
-        school = response.meta['school']
+        school = response.meta["school"]
 
-        address = response.css('.news-single-item p::text').extract()
-        #inspect_response(response, self)
-        school['street'] = address[0]
-        if (len(address) > 1):
-            address2 = address[1].split(' ')
-            school['postcode'] = address2[0]
+        address = response.css(".news-single-item p::text").extract()
+        # inspect_response(response, self)
+        school["street"] = address[0]
+        if len(address) > 1:
+            address2 = address[1].split(" ")
+            school["postcode"] = address2[0]
             address2.pop(0)
-            school['place'] = ' '.join(address2)
+            school["place"] = " ".join(address2)
         yield school
-
-
1 change: 0 additions & 1 deletion jedeschule/pipelines/__init__.py
@@ -1,4 +1,3 @@
 from .jsonpipeline import JsonPipeline
 from .school_pipeline import SchoolPipeline
 from .db_pipeline import DatabasePipeline
-
24 changes: 14 additions & 10 deletions jedeschule/pipelines/db_pipeline.py
@@ -1,4 +1,6 @@
-from __future__ import annotations  # needed so that update_or_create can define School return type
+from __future__ import (
+    annotations,
+)  # needed so that update_or_create can define School return type
 
 import logging
 import os
@@ -25,7 +27,7 @@ def get_session():
 
 
 class School(Base):
-    __tablename__ = 'schools'
+    __tablename__ = "schools"
     id = Column(String, primary_key=True)
     name = Column(String)
     address = Column(String)
@@ -41,28 +43,30 @@ class School(Base):
     phone = Column(String)
     director = Column(String)
     raw = Column(JSON)
-    location = Column(Geometry('POINT'))
+    location = Column(Geometry("POINT"))
 
     @staticmethod
     def update_or_create(item: SchoolPipelineItem, session=None) -> School:
         if not session:
             session = get_session()
 
         school_data = {**item.info}
-        school = session.query(School).get(item.info['id'])
-        latitude = school_data.pop('latitude', None)
-        longitude = school_data.pop('longitude', None)
+        school = session.query(School).get(item.info["id"])
+        latitude = school_data.pop("latitude", None)
+        longitude = school_data.pop("longitude", None)
         if latitude is not None and longitude is not None:
             location = WKTElement(f"POINT({longitude} {latitude})", srid=4326)
-            school_data['location'] = location
+            school_data["location"] = location
         if school:
-            session.query(School).filter_by(id=item.info['id']).update({**school_data, 'raw': item.item})
+            session.query(School).filter_by(id=item.info["id"]).update(
+                {**school_data, "raw": item.item}
+            )
        else:
             school = School(**school_data, raw=item.item)
         return school
 
     def __str__(self):
-        return f'<School id={self.id}, name={self.name}>'
+        return f"<School id={self.id}, name={self.name}>"
 
 
 class DatabasePipeline:
@@ -75,7 +79,7 @@ def process_item(self, item: SchoolPipelineItem, spider):
             self.session.add(school)
             self.session.commit()
         except SQLAlchemyError as e:
-            logging.warning('Error when putting to DB')
+            logging.warning("Error when putting to DB")
             logging.warning(e)
             self.session.rollback()
         return school
9 changes: 6 additions & 3 deletions jedeschule/pipelines/jsonpipeline.py
@@ -1,12 +1,15 @@
 from scrapy.exporters import JsonItemExporter
 import os
 
+
 class JsonPipeline(object):
     def open_spider(self, spider):
         if not os.path.exists("data"):
             os.makedirs("data")
-        self.file = open("data/" + spider.name + ".json", 'wb')
-        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
+        self.file = open("data/" + spider.name + ".json", "wb")
+        self.exporter = JsonItemExporter(
+            self.file, encoding="utf-8", ensure_ascii=False
+        )
         self.exporter.start_exporting()
 
@@ -15,4 +18,4 @@ def close_spider(self, spider):
 
     def process_item(self, item, spider):
         self.exporter.export_item(item)
-        return item
\ No newline at end of file
+        return item