Skip to content
This repository has been archived by the owner on Sep 29, 2024. It is now read-only.

feat: proof of concept for tutti.ch/immobilien #2

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 50 additions & 25 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,39 +1,64 @@
appdirs==1.4.3
appnope==0.1.0
attrs==19.3.0
Automat==0.8.0
black==19.10b0
certifi==2019.11.28
cffi==1.13.2
backcall==0.2.0
black==20.8b1
certifi==2020.6.20
cffi==1.14.3
chardet==3.0.4
Click==7.0
Click==7.1.2
constantly==15.1.0
cryptography==2.8
cryptography==3.1.1
cssselect==1.1.0
hyperlink==19.0.0
idna==2.8
decorator==4.4.2
docker==4.3.1
hyperlink==20.0.1
idna==2.10
incremental==17.5.0
lxml==4.4.2
msgpack==0.6.2
parsel==1.5.2
pathspec==0.7.0
ipdb==0.13.4
ipython==7.18.1
ipython-genutils==0.2.0
itemadapter==0.1.1
itemloaders==1.0.3
jedi==0.17.2
jmespath==0.10.0
lxml==4.6.1
mypy-extensions==0.4.3
parsel==1.6.0
parso==0.7.1
pathspec==0.8.0
pexpect==4.8.0
pickleshare==0.7.5
prompt-toolkit==3.0.8
Protego==0.1.16
ptyprocess==0.6.0
pyasn1==0.4.8
pyasn1-modules==0.2.7
pycparser==2.19
pyasn1-modules==0.2.8
pycparser==2.20
PyDispatcher==2.0.5
PyHamcrest==1.9.0
Pygments==2.7.1
PyHamcrest==2.0.2
pyOpenSSL==19.1.0
python-dotenv==0.10.3
python-dotenv==0.14.0
PyYAML==5.1
queuelib==1.5.0
regex==2019.12.20
requests==2.22.0
scrapinghub==2.3.0
Scrapy==1.8.0
regex==2020.10.15
requests==2.24.0
retrying==1.3.3
scrapinghub==2.3.1
Scrapy==2.4.0
service-identity==18.1.0
# shub==2.12.0
six==1.13.0
toml==0.10.0
Twisted==19.10.0
typed-ast==1.4.0
urllib3==1.25.7
w3lib==1.21.0
zope.interface==4.7.1
toml==0.10.1
tqdm==4.11.2
traitlets==5.0.5
Twisted==20.3.0
typed-ast==1.4.1
typing-extensions==3.7.4.3
urllib3==1.25.11
w3lib==1.22.0
wcwidth==0.2.5
websocket-client==0.54.0
zope.interface==5.1.2
27 changes: 12 additions & 15 deletions tutti/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,35 @@
from .utils import post_to_slack


NO_EXECUTATION_TIME = 0


class TuttiPipeline:
def open_spider(self, spider):
self.spider = spider
self.last_job_ids = self.get_last_job_ids()
self.last_job_executation_time = self.get_last_job_executation_time()

def process_item(self, item, spider):
if item["id"] not in self.last_job_ids:
item_time = item["time"]

if self.last_job_executation_time <= item_time:
self.handle_webhooks(item)

return item

def get_last_job_ids(self):
def get_last_job_executation_time(self):
project_id = os.environ.get("SCRAPY_PROJECT_ID")
api_key = self.spider.settings.get("SCRAPINGHUB_API_KEY")

if not project_id or not api_key:
return []
return NO_EXECUTATION_TIME

client = ScrapinghubClient(api_key)
project = client.get_project(project_id)
jobs = project.jobs.list()

if not jobs:
return []

        # Find the last job that ran with this spider's searchterm;
        # the same spider can be invoked with different searchterms.
last_matching_job = None
return NO_EXECUTATION_TIME

for each in jobs:
key = each["key"]
Expand All @@ -40,13 +41,9 @@ def get_last_job_ids(self):
searchterm = metadata.get("spider_args", {}).get("searchterm", "")

if self.spider.searchterm == searchterm:
last_matching_job = job
break

if not last_matching_job:
return []
return int(metadata["running_time"] / 1000)

return [item["id"] for item in last_matching_job.items.iter()]
return NO_EXECUTATION_TIME

def handle_webhooks(self, item):
slack_webhook = self.spider.settings.get("SLACK_WEBHOOK")
Expand Down
67 changes: 67 additions & 0 deletions tutti/spiders/immobilien.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
import scrapy
import json
import re


class ImmobilienSpider(scrapy.Spider):
    """Scrape rental real-estate listings from tutti.ch/de/immobilien.

    Listings are read from the JSON blob the site embeds in each result
    page as ``window.__INITIAL_STATE__`` and yielded newest-first
    (descending ``epoch_time``).
    """

    name = "immobilien"

    # Compiled once at class-definition time instead of on every parse()
    # call. The dots in the literal are escaped so the pattern cannot
    # accidentally match unrelated text ('.' would match any character);
    # DOTALL lets the capture group span the multi-line JSON payload.
    INITIAL_STATE_RE = re.compile(r"window\.__INITIAL_STATE__=(.*)", re.DOTALL)

    def __init__(
        self,
        pages=1,
        searchterm=None,
        object_type=None,
        max_price=None,
        min_sqm=None,
        rooms=None,
        **kwargs,
    ):
        """Store the search filters used to build request URLs.

        :param pages: number of result pages to crawl (coerced to ``int``)
        :param searchterm: location slug for the URL path ('' = no filter)
        :param object_type: property-type slug; defaults to ``"wohnungen"``
        :param max_price: upper price bound for the query string
        :param min_sqm: minimum floor area for the query string
        :param rooms: room-count filter for the query string
        """
        super().__init__(**kwargs)
        self.pages = int(pages)
        # Falsy filters become empty strings, which render as
        # "no filter" in the request URL below.
        self.searchterm = searchterm if searchterm else ""
        self.object_type = object_type if object_type else "wohnungen"
        self.max_price = max_price if max_price else ""
        self.min_sqm = min_sqm if min_sqm else ""
        self.rooms = rooms if rooms else ""

    def start_requests(self):
        """Yield one request per result page, 1 through ``self.pages``.

        ``dont_filter=True`` because the paged URLs differ only in the
        query string and must not be deduplicated away.
        """
        for page in range(1, self.pages + 1):
            yield scrapy.Request(
                callback=self.parse,
                dont_filter=True,
                url=f"https://www.tutti.ch/de/immobilien/objekttyp/{self.object_type}/standort/{self.searchterm}/typ/mieten"
                + f"?floor_area={self.min_sqm}&price=,{self.max_price}&rooms={self.rooms}&paging={page}",
            )

    def transform_raw(self, data):
        """Map one raw tutti.ch listing dict to the item schema.

        Only ``data["id"]`` is required; every other field degrades to
        ``None`` (or an empty list for images) when absent. The complete
        raw payload is kept under ``_meta`` for debugging.
        """
        location_info = data.get("location_info", {})
        return {
            "id": data["id"],
            "subject": data.get("subject"),
            "body": data.get("body"),
            "price": data.get("price"),
            "time": data.get("epoch_time"),
            "region": location_info.get("region_name"),
            "coordinates": data.get("location"),
            "plz": location_info.get("plz"),
            "link": f"https://www.tutti.ch/vi/{data['id']}",
            "thumbnail": f"https://c.tutti.ch/images/{data.get('thumb_name')}",
            "images": [
                f"https://c.tutti.ch/images/{image}"
                for image in data.get("image_names", [])
            ],
            "_meta": data,
        }

    def parse(self, response):
        """Extract the embedded JSON state and yield listings newest-first."""
        raw_state = response.xpath(
            '//script[contains(., "INITIAL_STATE")]/text()'
        ).re(self.INITIAL_STATE_RE)[0]

        items = json.loads(raw_state)["items"]
        # Ascending sort by epoch_time, then reversed => newest first.
        # Kept as reversed(sorted(...)) rather than reverse=True so the
        # relative order of equal timestamps is unchanged.
        offers = reversed(sorted(items.items(), key=lambda item: item[1]["epoch_time"]))

        for _, offer in offers:
            yield self.transform_raw(offer)
15 changes: 12 additions & 3 deletions tutti/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,24 @@


def post_to_slack(item, webhook):
coordinates = item.get("coordinates")

if coordinates:
params = f"ll={coordinates['lat']},{coordinates['lon']}"
link = f"https://www.google.com/maps?{params}"
location = f":round_pushpin: <{link}|Region {item['region']}, {item['plz']}>\n"
else:
location = f":round_pushpin: Region {item['region']}, {item['plz']}\n"

payload = {
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": f"*<{item['link']}|{item['subject']}>*\n\n"
+ f":round_pushpin: Region {item['region']}, {item['plz']}\n"
+ f"*:heavy_dollar_sign: Price {item['price']}*",
"text": f"*<{item['link']}|{item['subject']}>*\n"
+ location
+ f"*:heavy_dollar_sign: {item['price']}*",
},
"accessory": {
"type": "image",
Expand Down