; diskover config file
; if you make any changes, restart worker bots so they get the new config
[excludes]
; directory names and absolute paths you want to exclude from crawl, case-sensitive, can include wildcards (.* or backup* or /dir/dirname* or *tmp or *tmp* etc)
dirs = .*,.snapshot,.Snapshot,.zfs
; files you want to exclude from crawl, case-sensitive, can include wildcards (.*, *.doc or NULLEXT for files with no extension)
files = .*,Thumbs.db,.DS_Store,._.DS_Store,.localized,desktop.ini
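; illustrative example only (values not part of the defaults above), showing
; wildcard excludes for backup dirs and files with no extension:
;dirs = .*,.snapshot,.Snapshot,.zfs,backup*,*tmp*
;files = .*,Thumbs.db,NULLEXT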
[includes]
; directory names and absolute paths you want to include (whitelist), case-sensitive, you don't need to whitelist rootdir (-d rootdir)
;dirs = .recycle
; files you want to include (whitelist), case-sensitive
;files =
[ownersgroups]
; control how owner (username) and group fields are stored for file and directory docs
; store uids and gids instead of trying to get owner and group names (default is False)
;uidgidonly = False
; set to True if owner/group names contain a domain name (default is False)
;domain = False
; character separator used on cifs/nfs mounts to separate user/group and domain name, usually \\ or @
;domainsep = \\
; if the domain name comes before the separator, set this to True, otherwise False (default is True)
;domainfirst = True
; when indexing owner and group fields, keep the domain name (default is False)
;keepdomain = False
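; worked example (MYDOMAIN and jsmith are placeholder names): with domain = True,
; domainsep = \\ and domainfirst = True, an owner of MYDOMAIN\jsmith is indexed
; as jsmith; set keepdomain = True to store MYDOMAIN\jsmith instead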
[autotag]
; pattern dictionaries for diskover bots to use when auto-tagging, values are case-sensitive, can include wildcards for ext, name or path (tmp* or TMP* or *tmp or *TMP* etc)
; can also specify file name with json, example files = autotag.files.json
;files = [{"name": [], "name_exclude": [], "ext": ["tmp*", "TMP*", "temp*", "TEMP*", "cache*", "CACHE*"], "path": ["*/Application Support/*", "*/Containers/*"], "path_exclude": [], "mtime": 90, "atime": 0, "ctime": 90, "tag": "delete", "tag_custom": "autotag"}]
;dirs = [{"name": ["*tmp*", "*TMP*", "*temp*", "*TEMP*", "*Temp*", "*cache*", "*CACHE*", "*Cache*"], "name_exclude": ["*templates*", "*Templates*"], "path": ["*/Application Support/*", "*/Containers/*"], "path_exclude": [], "mtime": 90, "atime": 0, "ctime": 90, "tag": "delete", "tag_custom": "autotag"}]
[elasticsearch]
; uncomment the below three lines if you are using AWS ES
;aws = False
;host = search-diskover-es-cluster-eg3yztrvzb6qucroyyjk2vokza.ap-northeast-1.es.amazonaws.com
;port = 443
; below two lines are for local ES, comment out if you are using AWS ES
; set multiple hosts using comma such as eshost1,eshost2
host = localhost
port = 9200
; uncomment the below two lines if you installed X-Pack, for http-auth
;user = elastic
;password = changeme
; index name for ES, the cli arg overrides this
indexname = diskover-index
; timeout for connection to ES (default is 10)
timeout = 30
; number of connections kept open to ES when crawling (default is 10)
maxsize = 20
; max retries for ES operations (default is 0)
maxretries = 10
; wait for at least yellow status before bulk uploading (default is False), set to True if you want to wait
wait = False
; chunk size for ES bulk operations (default is 500)
chunksize = 1000
; number of shards for index (default is 5)
shards = 1
; number of replicas for index (default is 1)
replicas = 0
; the below settings are to optimize ES for crawling
; index refresh interval (default is 1s), set to -1 to disable refresh during crawl (fastest performance but no index searches); after the crawl it is set back to 1s
indexrefresh = 30s
; disable replicas during crawl, set to True to turn off replicas or False to keep them on (default is False); after the crawl replicas is set back to the value above
disablereplicas = True
; transaction log flush threshold size (default 512mb)
translogsize = 1gb
; search scroll size (default 100 docs)
scrollsize = 1000
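; illustrative examples: a multi-node cluster can be set with comma-separated
; hosts as noted above, e.g. host = eshost1,eshost2; on a single-node cluster,
; replicas = 0 avoids unassigned replica shards keeping the index yellow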
[redis]
host = 127.0.0.1
port = 6379
; set to a socket path to use a unix socket instead of host/port, comment out to not use one (default)
;socket = /tmp/redis.sock
;password =
; database to use (default is 0)
db = 0
; rq job default timeout in sec (default 180)
timeout = 3600
; rq default ttl for key/results (default 500)
ttl = 500
; rq queue names to use (default is diskover, diskover_crawl, diskover_calcdir)
queue = diskover
queuecrawl = diskover_crawl
queuecalcdir = diskover_calcdir
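; example (redis01.example.com is a placeholder hostname): worker bots running
; on other machines need to point at the same redis server, e.g.
;host = redis01.example.com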
[adaptivebatch]
; adaptive batch settings when using -a (intelligent crawling)
; batchsize (number of dirs) to start at
startsize = 50
; maximum size of batch
maxsize = 500
; when adjusting batch size, use this step for +/- (batch size increases when the queue is at 0, decreases when it is > 0)
stepsize = 10
; max number of files in a batch, the batch size above is ignored if this file limit is reached (default 50000)
maxfiles = 50000
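; worked example with the values above: batches start at 50 dirs, grow by
; stepsize (50 -> 60 -> 70 ...) while the queue is at 0, shrink by stepsize
; when the queue is > 0, and never exceed 500 dirs (maxsize) or 50000 files
; (maxfiles), whichever limit is hit first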
[paths]
; used by diskover socket server
; path to diskover.py (default is ./diskover.py)
diskoverpath = ./diskover.py
; path to python executable (default is python)
pythonpath = python
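; illustrative example (placeholder paths): a virtualenv install could use
;diskoverpath = /opt/diskover/diskover.py
;pythonpath = /opt/diskover/venv/bin/python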
[socketlistener]
; hostname and port (TCP) for diskover socket server for remote commands
host = localhost
port = 9999
; max connections for diskover socket server
maxconnections = 5
; port (TCP) for diskover socket server for messages from diskover treewalk client
twcport = 9998
[dupescheck]
; read size (bytes) for md5 sum check (how many bytes to read in at a time when md5 checking, default 64 KB)
readsize = 65536
; max size (bytes) of files to check (files larger than this will be skipped, default 1 GB)
maxsize = 1073741824
; bytes to check at start and end of file before doing md5 sum check (set large enough to account for file header info, default is 64)
checkbytes = 64
; try to restore times (mtime/atime) for files that get opened by byte check and md5
; set to True or False, default is False (useful for cifs, which does not work with the noatime mount option)
restoretimes = False
; number of threads for calculating md5 checksum of files
threads = 8
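; worked example with the values above: files larger than 1 GB (maxsize) are
; skipped; candidates first have their first and last 64 bytes (checkbytes)
; compared, and only matching files get a full md5 sum, read 64 KB (readsize)
; at a time using 8 threads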
[gource]
; should be set to the same value as in diskover-gource.sh
maxfilelag = 0.1
[crawlapi]
; crawl api url endpoint
;url = http://localhost:8080/api
; optional api login
;user = admin
;pass = admin
; number of items per page for each directory list request
;pagesize = 1000