English | 中文
SCHEDULER_QUEUE_CLASS: the queue type used for request tasks. The default is the in-memory queue.
SCHEDULER_QUEUE_CLASS = 'aioscrapy.queue.memory.SpiderPriorityQueue'
SCHEDULER_QUEUE_CLASS = 'aioscrapy.queue.redis.SpiderPriorityQueue'
# Redis parameters
'queue': {
'url': 'redis://',
'max_connections': 2,
'timeout': None,
'retry_on_timeout': True,
'health_check_interval': 30,
SCHEDULER_QUEUE_CLASS = 'aioscrapy.queue.rabbitmq.SpiderPriorityQueue'
# RabbitMQ parameters
'queue': {
'url': "amqp://guest:<password>@<host>:5673/",
'connection_max_size': 2,
'channel_max_size': 10,
DUPEFILTER_CLASS: filters duplicate URLs. There is no default configuration.
Save URL fingerprint information to disk.
DUPEFILTER_CLASS = 'aioscrapy.dupefilters.disk.RFPDupeFilter'
Save URL fingerprint information to redis, Hash the URL.
DUPEFILTER_CLASS = 'aioscrapy.dupefilters.redis.RFPDupeFilter'
Save URL fingerprint information to redis, use Bloom filter.
DUPEFILTER_CLASS = 'aioscrapy.dupefilters.redis.BloomDupeFilter'
: Whether to close the crawler when the queue has no work. Default: False
How to deploy a distributed aio-scrapy crawler with scrapyd
Install scrapyd
pip install scrapyd
Modify scrapyd configuration
eggs_dir = eggs
logs_dir = logs
items_dir =
jobs_to_keep = 5
dbs_dir = dbs
max_proc = 0
max_proc_per_cpu = 4
finished_to_keep = 100
poll_interval = 5.0
bind_address =
http_port = 6800
debug = off
# runner = scrapyd.runner # The original configuration
runner = aioscrapy.scrapyd.runner # Replace runner with aio-scrapy runner
application = scrapyd.app.application
launcher = scrapyd.launcher.Launcher
webroot = scrapyd.website.Root
schedule.json = scrapyd.webservice.Schedule
cancel.json = scrapyd.webservice.Cancel
addversion.json = scrapyd.webservice.AddVersion
listprojects.json = scrapyd.webservice.ListProjects
listversions.json = scrapyd.webservice.ListVersions
listspiders.json = scrapyd.webservice.ListSpiders
delproject.json = scrapyd.webservice.DeleteProject
delversion.json = scrapyd.webservice.DeleteVersion
listjobs.json = scrapyd.webservice.ListJobs
daemonstatus.json = scrapyd.webservice.DaemonStatus
Start scrapyd
scrapyd &
Please refer to scrapyd's documentation for more details.
CSV Bulk Storage Middleware
'aioscrapy.libs.pipelines.csv.CsvPipeline': 100,
# Format requirements for item
item = {
'__csv__': {
'filename': 'article', # File name, or storage path plus file name, e.g. D:\article.xlsx
# Below are the item fields
'title': "title",
Excel Bulk Storage Middleware
'aioscrapy.libs.pipelines.execl.ExeclPipeline': 100,
# Format requirements for item
item = {
'__execl__': {
'filename': 'article', # File name to store, eg:D:\article.xlsx
'sheet': 'sheet1', # sheet name, default: sheet1
# 'img_fields': ['img'], # Specify the image fields when you want to download
# 'img_size': (100, 100) # the size of image
# Below are the item fields
'title': "title",
'img': "https://domain/test.png",
Mysql Bulk Storage Middleware
'aioscrapy.libs.pipelines.mysql.MysqlPipeline': 100,
# mysql parameter
# "default" is alias of the mysql pool
# Use:
# from aioscrapy.db import db_manager
# async with db_manager.get('default') as (conn, cur):
# print(await cur.execute('select 1'))
'default': {
'db': 'test',
'user': 'root',
'password': '123456',
'host': '',
'port': 3306,
'charset': 'utf8mb4',
# # "dev" is alias of the mysql pool
# 'dev': {
# 'db': 'test2',
# 'user': 'root',
# 'password': 'root',
# 'host': '',
# 'port': 3306,
# 'charset': 'utf8mb4',
# }
SAVE_CACHE_NUM = 1000 # Trigger mysql storage every 1000 items.
SAVE_CACHE_INTERVAL = 10 # Trigger mysql storage every 10 seconds.
# Format requirements for item
item = {
'__mysql__': {
'table_name': 'baidu', # table name of mysql
'insert_type': 'insert', # Save type for mysql
'db_alias': ['default'], # Alias of mysql to save
# Below are the item fields
'title': "title",
Mongo Bulk Storage Middleware
'aioscrapy.libs.pipelines.mongo.MongoPipeline': 100,
'default': {
'host': 'mongodb://root:<password>@<host>:27017',
'db': 'test',
SAVE_CACHE_NUM = 1000 # Trigger mongo storage every 1000 items.
SAVE_CACHE_INTERVAL = 10 # Trigger mongo storage every 10 seconds.
# Format requirements for item
item = {
'__mongo__': {
'db_alias': 'default', # Alias of mongo to save
'table_name': 'article', # table name of mongo
# 'db_name': 'xxx', # db name of mongo. If not specified, defaults to the "db" value in "MONGO_ARGS"
# Below are the item fields
'title': "title",
'aioscrapy.libs.pipelines.pg.PGPipeline': 100,
'default': {
'user': 'user',
'password': 'password',
'database': 'spider_db',
'host': ''
SAVE_CACHE_NUM = 1000 # Trigger storage every 1000 items.
SAVE_CACHE_INTERVAL = 10 # Trigger storage every 10 seconds.
# Format requirements for item
item = {
'__pg__': {
'db_alias': 'default', # Alias of PostgreSQL to save
'table_name': 'spider_db.article', # schema and table_name, Separate with "."
'insert_type': 'insert', # Save type for PostgreSQL
# 'on_conflict': 'id',
# Below are the item fields
'title': "title",