[feature] Add proxies to estela-entrypoint
* [MRG] Add proxies to estela-entrypoint (#30)
* Add EstelaProxyMiddleware
* Add proxy_response_bytes stat
* Include proxies usage info in update_job util

---------

Co-authored-by: joaquin garmendia <[email protected]>
mgonnav and joaquingx authored Oct 30, 2023
1 parent 3668436 commit cc1cf07
Showing 5 changed files with 73 additions and 1 deletion.
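
For anyone trying this change locally: the new proxy support is driven entirely by environment variables. A minimal sketch of the configuration EstelaProxyMiddleware expects (variable names come from estela_scrapy/middlewares.py below; the values are placeholders, not real credentials):

import os

# Any non-empty value enables EstelaProxyMiddleware; without it the
# middleware raises NotConfigured and stays out of the download chain.
os.environ["ESTELA_PROXIES_ENABLED"] = "1"
os.environ["ESTELA_PROXY_NAME"] = "example-provider"  # reported as the downloader/proxy_name stat
os.environ["ESTELA_PROXY_USER"] = "user"
os.environ["ESTELA_PROXY_PASS"] = "secret"
os.environ["ESTELA_PROXY_URL"] = "proxy.example.com"
os.environ["ESTELA_PROXY_PORT"] = "8080"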
4 changes: 4 additions & 0 deletions estela_scrapy/extensions.py
@@ -94,6 +94,10 @@ def spider_closed(self, spider, reason):
total_bytes=stats.get("downloader/response_bytes", 0),
item_count=stats.get("item_scraped_count", 0),
request_count=stats.get("downloader/request_count", 0),
proxy_usage_data={
"proxy_name": stats.get("downloader/proxy_name", ""),
"bytes": stats.get("downloader/proxies/response_bytes", 0),
},
)

parsed_stats = json.dumps(stats, default=json_serializer)
64 changes: 64 additions & 0 deletions estela_scrapy/middlewares.py
@@ -1,9 +1,28 @@
import logging
import os

from scrapy.exceptions import NotConfigured
from scrapy.utils.python import to_bytes
from scrapy.utils.request import request_fingerprint
from twisted.web import http

from estela_scrapy.utils import parse_time, producer

proxy_logger = logging.getLogger("proxy_mw")


def get_header_size(headers):
size = 0
for key, value in headers.items():
if isinstance(value, (list, tuple)):
for v in value:
size += len(b": ") + len(key) + len(v)
return size + len(b"\r\n") * (len(headers.keys()) - 1)


def get_status_size(response_status):
return len(to_bytes(http.RESPONSES.get(response_status, b""))) + 15


class StorageDownloaderMiddleware:
def process_response(self, request, response, spider):
@@ -21,3 +40,48 @@ def process_response(self, request, response, spider):
}
producer.send("job_requests", data)
return response


class EstelaProxyMiddleware:
@classmethod
def from_crawler(cls, crawler):
estela_proxies_enabled = os.getenv("ESTELA_PROXIES_ENABLED")
if not estela_proxies_enabled:
raise NotConfigured
return cls(crawler.settings, crawler.stats, crawler.spider)

def get_proxies_attributes(self, settings):
username = os.getenv("ESTELA_PROXY_USER")
password = os.getenv("ESTELA_PROXY_PASS")
port = os.getenv("ESTELA_PROXY_PORT")
url = os.getenv("ESTELA_PROXY_URL")
return username, password, port, url

def __init__(self, settings, stats, spider):
self.username, self.password, self.port, self.url = self.get_proxies_attributes(
settings
)
self.stats = stats
self.stats.set_value(
"downloader/proxy_name", os.getenv("ESTELA_PROXY_NAME"), spider=spider
)

def process_request(self, request, spider):
if not request.meta.get("proxies_disabled"):
proxy_logger.debug("Using proxies with request %s", request.url)
host = f"http://{self.username}:{self.password}@{self.url}:{self.port}"
request.meta["proxy"] = host
self.stats.inc_value("downloader/proxies/count", spider=spider)

def process_response(self, request, response, spider):
if not request.meta.get("proxies_disabled"):
reslen = (
len(response.body)
+ get_header_size(response.headers)
+ get_status_size(response.status)
+ 4
)
self.stats.inc_value(
"downloader/proxies/response_bytes", reslen, spider=spider
)
return response
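
A minimal sketch of how a spider would interact with the middleware above (the spider and URLs are illustrative; the proxies_disabled meta key and the stats names are taken from the diff):

import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"

    def start_requests(self):
        # Routed through the configured proxy (the default behaviour once
        # ESTELA_PROXIES_ENABLED is set).
        yield scrapy.Request("https://example.com/")
        # Opt a single request out of the proxy.
        yield scrapy.Request(
            "https://example.com/internal",
            meta={"proxies_disabled": True},
        )

    def parse(self, response):
        # downloader/proxies/count and downloader/proxies/response_bytes
        # accumulate in the crawler stats as proxied responses come back.
        yield {"url": response.url}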
1 change: 1 addition & 0 deletions estela_scrapy/settings.py
@@ -25,6 +25,7 @@ def update_deprecated_classpaths(settings):

def load_default_settings(settings):
downloader_middlewares = {
"estela_scrapy.middlewares.EstelaProxyMiddleware": 702,
"estela_scrapy.middlewares.StorageDownloaderMiddleware": 1000,
}
spider_middlewares = {}
3 changes: 3 additions & 0 deletions estela_scrapy/utils.py
@@ -1,3 +1,4 @@
import json
from datetime import date, datetime, timedelta

import requests
@@ -35,6 +36,7 @@ def update_job(
total_bytes=0,
item_count=0,
request_count=0,
proxy_usage_data={},
):
requests.patch(
job_url,
@@ -44,6 +46,7 @@
"total_response_bytes": total_bytes,
"item_count": item_count,
"request_count": request_count,
"proxy_usage_data": json.dumps(proxy_usage_data),
},
headers={"Authorization": "Token {}".format(auth_token)},
)
2 changes: 1 addition & 1 deletion setup.py
@@ -11,7 +11,7 @@
"Scrapy>=1.0",
"requests",
"redis",
"estela-queue-adapter @ git+https://github.com/bitmakerla/estela-queue-adapter.git"
"estela-queue-adapter @ git+https://github.com/bitmakerla/estela-queue-adapter.git",
],
entry_points={
"console_scripts": [
