From 5dd97bc10903271858c3a0ae4f6ebc11f91605fb Mon Sep 17 00:00:00 2001
From: Livio Bieri
Date: Mon, 20 Jan 2020 22:57:52 +0800
Subject: [PATCH 1/7] feat: proof of concept for tutti.ch/immobilien

---
 tutti/spiders/immobilien.py | 66 +++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 tutti/spiders/immobilien.py

diff --git a/tutti/spiders/immobilien.py b/tutti/spiders/immobilien.py
new file mode 100644
index 0000000..90a86fd
--- /dev/null
+++ b/tutti/spiders/immobilien.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+import scrapy
+import json
+import re
+
+
+class ImmobilienSpider(scrapy.Spider):
+    name = "immobilien"
+
+    def __init__(
+        self,
+        pages=1,
+        searchterm=None,
+        object_type=None,
+        max_price=None,
+        min_sqm=None,
+        rooms=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.pages = int(pages)
+        self.searchterm = searchterm if searchterm else ""
+        self.object_type = object_type if object_type else "wohnungen"
+        self.max_price = max_price if max_price else ""
+        self.min_sqm = min_sqm if min_sqm else ""
+        self.rooms = rooms if rooms else ""
+
+    def start_requests(self):
+        for page in range(1, self.pages + 1):
+            yield scrapy.Request(
+                callback=self.parse,
+                dont_filter=True,
+                url=f"https://www.tutti.ch/de/immobilien/objekttyp/{self.object_type}/standort/ort-{self.searchterm}/typ/mieten"
+                + f"?floor_area={self.min_sqm}&price=,{self.max_price}&rooms={self.rooms}&paging={page}",
+            )
+
+    def transform_raw(self, data):
+        return {
+            "id": data["id"],
+            "subject": data.get("subject"),
+            "body": data.get("body"),
+            "price": data.get("price"),
+            "time": data.get("epoch_time"),
+            "region": data.get("location_info", {}).get("region_name"),
+            "plz": data.get("location_info", {}).get("plz"),
+            "link": f"https://www.tutti.ch/vi/{data['id']}",
+            "thumbnail": f"https://c.tutti.ch/images/{data.get('thumb_name')}",
+            "images": [
+                f"https://c.tutti.ch/images/{image}"
+                for image in data.get("image_names", [])
+            ],
+            "_meta": data,
+        }
+
+    def parse(self, response):
+        pattern = re.compile(r"window\.__INITIAL_STATE__=(.*)", re.MULTILINE | re.DOTALL)
+
+        data = response.xpath('//script[contains(., "INITIAL_STATE")]/text()').re(
+            pattern
+        )[0]
+
+        items = json.loads(data)["items"]
+        offers = sorted(items.items(), key=lambda item: item[1]["epoch_time"], reverse=True)
+
+        for _, offer in offers:
+            yield self.transform_raw(offer)

From 34e3b2b043a5e048ae5fccc74b43b5fb32e27eec Mon Sep 17 00:00:00 2001
From: Livio Bieri
Date: Sun, 26 Jan 2020 20:29:34 +0800
Subject: [PATCH 2/7] fix: prevent webhook for old items

---
 tutti/pipelines.py | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/tutti/pipelines.py b/tutti/pipelines.py
index 7a55014..f670db0 100644
--- a/tutti/pipelines.py
+++ b/tutti/pipelines.py
@@ -3,34 +3,32 @@
 from .utils import post_to_slack
 
 
+NO_EXECUTION_TIME = 0
+
+
 class TuttiPipeline:
     def open_spider(self, spider):
         self.spider = spider
-        self.last_job_ids = self.get_last_job_ids()
+        self.last_job_execution_time = self.get_last_job_execution_time()
 
     def process_item(self, item, spider):
-        if item["id"] not in self.last_job_ids:
+        if self.last_job_execution_time < item["time"]:
             self.handle_webhooks(item)
-
         return item
 
-    def get_last_job_ids(self):
+    def get_last_job_execution_time(self):
         project_id = os.environ.get("SCRAPY_PROJECT_ID")
         api_key = self.spider.settings.get("SCRAPINGHUB_API_KEY")
 
         if not project_id or not api_key:
-            return []
+            return NO_EXECUTION_TIME
 
         client = ScrapinghubClient(api_key)
         project = client.get_project(project_id)
 
         jobs = project.jobs.list()
 
         if not jobs:
-            return []
-
-        # find last job for spider searchterm same spider
-        # can be invoked with different searchterms
-        last_matching_job = None
+            return NO_EXECUTION_TIME
 
         for each in jobs:
             key = each["key"]
@@ -40,13 +38,9 @@ def get_last_job_ids(self):
             searchterm = metadata.get("spider_args", {}).get("searchterm", "")
 
             if self.spider.searchterm == searchterm:
-                last_matching_job = job
-                break
-
-        if not last_matching_job:
-            return []
+                return metadata["running_time"]
 
-        return [item["id"] for item in last_matching_job.items.iter()]
+        return NO_EXECUTION_TIME
 
     def handle_webhooks(self, item):
         slack_webhook = self.spider.settings.get("SLACK_WEBHOOK")

From d42221a29ca610671b6652d67156d01dafbd7578 Mon Sep 17 00:00:00 2001
From: Livio Bieri
Date: Sun, 26 Jan 2020 20:58:32 +0800
Subject: [PATCH 3/7] feat: add coordinates w/ google maps links

---
 tutti/spiders/immobilien.py |  1 +
 tutti/utils.py              | 15 ++++++++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/tutti/spiders/immobilien.py b/tutti/spiders/immobilien.py
index 90a86fd..0b74a39 100644
--- a/tutti/spiders/immobilien.py
+++ b/tutti/spiders/immobilien.py
@@ -42,6 +42,7 @@ def transform_raw(self, data):
             "price": data.get("price"),
             "time": data.get("epoch_time"),
             "region": data.get("location_info", {}).get("region_name"),
+            "coordinates": data.get("location"),
             "plz": data.get("location_info", {}).get("plz"),
             "link": f"https://www.tutti.ch/vi/{data['id']}",
             "thumbnail": f"https://c.tutti.ch/images/{data.get('thumb_name')}",
diff --git a/tutti/utils.py b/tutti/utils.py
index e667dd7..2ac0560 100644
--- a/tutti/utils.py
+++ b/tutti/utils.py
@@ -2,15 +2,24 @@
 
 def post_to_slack(item, webhook):
+    coordinates = item.get("coordinates")
+
+    if coordinates:
+        params = f"ll={coordinates['lat']},{coordinates['lon']}"
+        link = f"https://www.google.com/maps?{params}"
+        location = f":round_pushpin: <{link}|Region {item['region']}, {item['plz']}>\n"
+    else:
+        location = f":round_pushpin: Region {item['region']}, {item['plz']}\n"
+
     payload = {
         "blocks": [
             {
                 "type": "section",
                 "text": {
                     "type": "mrkdwn",
-                    "text": f"*<{item['link']}|{item['subject']}>*\n\n"
-                    + f":round_pushpin: Region {item['region']}, {item['plz']}\n"
-                    + f"*:heavy_dollar_sign: Price {item['price']}*",
+                    "text": f"*<{item['link']}|{item['subject']}>*\n"
+                    + location
+                    + f"*:heavy_dollar_sign: {item['price']}*",
                 },
                 "accessory": {
                     "type": "image",

From fbc5965692404aa57712b915add7fa787efab220 Mon Sep 17 00:00:00 2001
From: Livio Bieri
Date: Mon, 19 Oct 2020 21:28:25 +0200
Subject: [PATCH 4/7] chore: fix requirements

---
 requirements.txt | 65 +++++++++++++++++++++++++++++-------------------
 1 file changed, 39 insertions(+), 26 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 7755d24..385bd55 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,39 +1,52 @@
 appdirs==1.4.3
+appnope==0.1.0
 attrs==19.3.0
 Automat==0.8.0
-black==19.10b0
-certifi==2019.11.28
-cffi==1.13.2
-chardet==3.0.4
-Click==7.0
+backcall==0.2.0
+black==20.8b1
+cffi==1.14.3
+click==7.1.2
 constantly==15.1.0
-cryptography==2.8
+cryptography==3.1.1
 cssselect==1.1.0
-hyperlink==19.0.0
-idna==2.8
+decorator==4.4.2
+hyperlink==20.0.1
+idna==2.10
 incremental==17.5.0
-lxml==4.4.2
-msgpack==0.6.2
-parsel==1.5.2
-pathspec==0.7.0
+ipdb==0.13.4
+ipython==7.18.1
+ipython-genutils==0.2.0
+itemadapter==0.1.1
+itemloaders==1.0.3
+jedi==0.17.2
+jmespath==0.10.0
+lxml==4.6.1
+mypy-extensions==0.4.3
+parsel==1.6.0
+parso==0.7.1
+pathspec==0.8.0
+pexpect==4.8.0
+pickleshare==0.7.5
+prompt-toolkit==3.0.8
 Protego==0.1.16
+ptyprocess==0.6.0
 pyasn1==0.4.8
-pyasn1-modules==0.2.7
-pycparser==2.19
+pyasn1-modules==0.2.8
+pycparser==2.20
 PyDispatcher==2.0.5
-PyHamcrest==1.9.0
+Pygments==2.7.1
+PyHamcrest==2.0.2
 pyOpenSSL==19.1.0
-python-dotenv==0.10.3
 queuelib==1.5.0
-regex==2019.12.20
-requests==2.22.0
-scrapinghub==2.3.0
-Scrapy==1.8.0
+regex==2020.10.15
+Scrapy==2.4.0
 service-identity==18.1.0
 six==1.13.0
-toml==0.10.0
-Twisted==19.10.0
-typed-ast==1.4.0
-urllib3==1.25.7
-w3lib==1.21.0
-zope.interface==4.7.1
+toml==0.10.1
+traitlets==5.0.5
+Twisted==20.3.0
+typed-ast==1.4.1
+typing-extensions==3.7.4.3
+w3lib==1.22.0
+wcwidth==0.2.5
+zope.interface==5.1.2

From 255e9ec7c219b5e79a34068a653068c114e8f217 Mon Sep 17 00:00:00 2001
From: Livio Bieri
Date: Mon, 19 Oct 2020 22:45:46 +0200
Subject: [PATCH 5/7] chore: bump dependencies & change searchterm

---
 requirements.txt            | 14 +++++++++++++-
 tutti/spiders/immobilien.py |  2 +-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 385bd55..f26a1ec 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,12 +4,15 @@ attrs==19.3.0
 Automat==0.8.0
 backcall==0.2.0
 black==20.8b1
+certifi==2020.6.20
 cffi==1.14.3
-click==7.1.2
+chardet==3.0.4
+Click==7.0
 constantly==15.1.0
 cryptography==3.1.1
 cssselect==1.1.0
 decorator==4.4.2
+docker==4.3.1
 hyperlink==20.0.1
 idna==2.10
 incremental==17.5.0
@@ -37,16 +40,25 @@ PyDispatcher==2.0.5
 Pygments==2.7.1
 PyHamcrest==2.0.2
 pyOpenSSL==19.1.0
+python-dotenv==0.14.0
+PyYAML==5.3.1
 queuelib==1.5.0
 regex==2020.10.15
+requests==2.24.0
+retrying==1.3.3
+scrapinghub==2.3.1
 Scrapy==2.4.0
 service-identity==18.1.0
+shub==2.12.0
 six==1.13.0
 toml==0.10.1
+tqdm==4.11.2
 traitlets==5.0.5
 Twisted==20.3.0
 typed-ast==1.4.1
 typing-extensions==3.7.4.3
+urllib3==1.25.11
 w3lib==1.22.0
 wcwidth==0.2.5
+websocket-client==0.57.0
 zope.interface==5.1.2
diff --git a/tutti/spiders/immobilien.py b/tutti/spiders/immobilien.py
index 0b74a39..09c08db 100644
--- a/tutti/spiders/immobilien.py
+++ b/tutti/spiders/immobilien.py
@@ -30,7 +30,7 @@ def start_requests(self):
             yield scrapy.Request(
                 callback=self.parse,
                 dont_filter=True,
-                url=f"https://www.tutti.ch/de/immobilien/objekttyp/{self.object_type}/standort/ort-{self.searchterm}/typ/mieten"
+                url=f"https://www.tutti.ch/de/immobilien/objekttyp/{self.object_type}/standort/{self.searchterm}/typ/mieten"
                 + f"?floor_area={self.min_sqm}&price=,{self.max_price}&rooms={self.rooms}&paging={page}",
             )

From 3cb32302977b21ee34b77492c1c794d4cc6ea4aa Mon Sep 17 00:00:00 2001
From: Livio Bieri
Date: Tue, 20 Oct 2020 21:55:36 +0200
Subject: [PATCH 6/7] fix: time in ms, should fix future runs

---
 tutti/pipelines.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tutti/pipelines.py b/tutti/pipelines.py
index f670db0..2619eb5 100644
--- a/tutti/pipelines.py
+++ b/tutti/pipelines.py
@@ -12,8 +12,11 @@ def open_spider(self, spider):
         self.last_job_execution_time = self.get_last_job_execution_time()
 
     def process_item(self, item, spider):
-        if self.last_job_execution_time < item["time"]:
+        item_time = item["time"]
+
+        if self.last_job_execution_time <= item_time:
             self.handle_webhooks(item)
+
         return item
 
     def get_last_job_execution_time(self):
@@ -38,7 +41,7 @@ def get_last_job_execution_time(self):
             searchterm = metadata.get("spider_args", {}).get("searchterm", "")
 
             if self.spider.searchterm == searchterm:
-                return metadata["running_time"]
+                return int(metadata["running_time"] / 1000)
 
         return NO_EXECUTION_TIME

From b48a8c4a2d2ee7d458bfec0523521736e5f8d9e5 Mon Sep 17 00:00:00 2001
From: Livio Bieri
Date: Tue, 20 Oct 2020 22:18:55 +0200
Subject: [PATCH 7/7] chore: fix dependency snafu

---
 requirements.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index f26a1ec..3ed212f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,7 @@ black==20.8b1
 certifi==2020.6.20
 cffi==1.14.3
 chardet==3.0.4
-Click==7.0
+Click==7.1.2
 constantly==15.1.0
 cryptography==3.1.1
 cssselect==1.1.0
@@ -41,7 +41,7 @@ Pygments==2.7.1
 PyHamcrest==2.0.2
 pyOpenSSL==19.1.0
 python-dotenv==0.14.0
-PyYAML==5.3.1
+PyYAML==5.1
 queuelib==1.5.0
 regex==2020.10.15
 requests==2.24.0
@@ -49,7 +49,7 @@ retrying==1.3.3
 scrapinghub==2.3.1
 Scrapy==2.4.0
 service-identity==18.1.0
-shub==2.12.0
+# shub==2.12.0
 six==1.13.0
 toml==0.10.1
 tqdm==4.11.2
@@ -60,5 +60,5 @@ typing-extensions==3.7.4.3
 urllib3==1.25.11
 w3lib==1.22.0
 wcwidth==0.2.5
-websocket-client==0.57.0
+websocket-client==0.54.0
 zope.interface==5.1.2
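
Notes on the two core tricks in this series, with minimal standalone
sketches (Python, not part of any patch; the function names, sample
timestamps, and usage comments are illustrative assumptions, not code
from the repository).

Patch 1 works because tutti.ch embeds its full listing state as a
`window.__INITIAL_STATE__=...` assignment inside a <script> tag, so the
spider only has to locate that script and JSON-decode the right-hand
side. A sketch of that step, assuming the same page structure; it swaps
the patch's regex for json.JSONDecoder().raw_decode, which suits the
standalone setting:

    import json

    def extract_initial_state(html):
        """Decode the window.__INITIAL_STATE__ object from a tutti.ch page."""
        marker = "window.__INITIAL_STATE__="
        start = html.index(marker) + len(marker)  # ValueError if absent
        # raw_decode stops at the end of the first complete JSON value,
        # so any trailing ";</script>" is ignored without a fragile regex
        state, _ = json.JSONDecoder().raw_decode(html, start)
        return state

    # hypothetical usage -- the spider gets the page via Scrapy instead:
    # state = extract_initial_state(response.text)
    # for item_id, offer in state["items"].items():
    #     print(item_id, offer.get("subject"), offer.get("price"))

Patches 2 and 6 together implement the webhook gate: an item triggers a
notification only if its epoch_time is not older than the previous
ScrapingHub run, and because running_time is reported in milliseconds
while items carry epoch seconds, the value is divided by 1000. The same
logic as a pure function, with made-up timestamps:

    NO_EXECUTION_TIME = 0  # sentinel: no previous run, everything is new

    def should_notify(item_epoch_s, last_running_time_ms):
        """True if the item appeared at or after the last pipeline run."""
        if not last_running_time_ms:
            last_run_s = NO_EXECUTION_TIME
        else:
            last_run_s = int(last_running_time_ms / 1000)  # ms -> s
        return last_run_s <= item_epoch_s

    assert should_notify(1603200000, None)                # first run
    assert should_notify(1603200000, 1603100000000)       # newer item
    assert not should_notify(1603000000, 1603100000000)   # older item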