"""
Welcome to webxray! This file both launches an interactive mode and is
pre-configured to store data in sqlite, which is sufficient for many
users. For advanced users, the software supports also a massively
distributed scan infrastrcture, backend storage in postgres,
and has been used to build multi-billion record datasets. Many of
these options are available via command-line flags.
"""
# standard python packages
import datetime
import multiprocessing
import optparse
import os
import re
import socket
import sys
import time
import urllib.parse
import urllib.request
# set up database connection
db_engine = 'sqlite'
if db_engine == 'sqlite':
from webxray.SQLiteDriver import SQLiteDriver
sql_driver = SQLiteDriver()
# pool_size sets how many parallel processes are run
# when using sqlite we set to 1 to avoid issues with
# multiple processes trying to use sqlite.
pool_size = 1
elif db_engine == 'postgres':
from webxray.PostgreSQLDriver import PostgreSQLDriver
sql_driver = PostgreSQLDriver()
# if we are using postgres the database can handle many
# connections so we set pool_size to None which sets up
# one process per processor core
pool_size = None
else:
print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
quit()
# import our custom utilities
from webxray.Utilities import Utilities
utilities = Utilities(db_engine=db_engine)
# check for various dependencies, python version, etc.
utilities.check_dependencies()
# SET CONFIG
#
# There are a large number of settings for webXray, which are
# set in a 'config' variable. Two default configurations are
# available: 'haystack', which collects the data needed for examining
# data transfers, and 'forensic', which collects everything, including
# images, page text, and the contents of files. It is A VERY BAD IDEA
# to conduct forensic scans on lists of random webpages,
# as you may be downloading and storing files you do not want.
#
# Only use forensic when you are TOTALLY SURE you want to retain
# all site content on your machine. Advanced users can either
# edit config details directly in the database or create their
# own custom config in Utilities.py.
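# For example, to run a forensic scan instead of the default haystack
# scan (only if you are certain you want to retain all site content):
#
#	config = utilities.get_default_config('forensic')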
config = utilities.get_default_config('haystack')
# Set the client_id based on the hostname; you may also put in
# a custom value of your choosing.
client_id = socket.gethostname()
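# e.g., a hand-picked identifier instead of the hostname
# (the value below is an arbitrary placeholder):
#	client_id = 'scan_worker_01'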
####################
# HELPER FUNCTIONS #
####################
def quit():
"""
Make sure we close the db connection before we exit.
"""
print('------------------')
print('Quitting, bye bye!')
print('------------------')
sql_driver.close()
	sys.exit()
# quit
def interaction():
"""
Handles user interaction, alternative to command line flags, good
for most people.
"""
print('\tWould you like to:')
print('\t\t[C] Collect Data')
print('\t\t[A] Analyze Data')
print('\t\t[V] Visualize Data')
print('\t\t[PC] Policy Collect')
print('\t\t[PA] Policy Analyze')
print('\t\t[Q] Quit')
# loop until we get acceptable input
while True:
selection = input("\tSelection: ").lower()
acceptable_input = ['c','a','v','pc','pa','q']
if selection == 'q':
quit()
elif selection in acceptable_input:
break
else:
			print('\t\tInvalid selection, please try again.')
continue
# we are collecting new data
if selection == 'c':
print('\t===============')
print('\tCollecting Data')
print('\t===============')
print('\tWould you like to:')
print('\t\t[C] Create a New Database')
print('\t\t[A] Add to an Existing Database')
print('\t\t[Q] Quit')
# interaction: loop until we get acceptable input
while True:
selection = input("\tSelection: ").lower()
if selection == 'c':
break
elif selection == 'a':
break
elif selection == 'q':
quit()
else:
print('\t\tValid selections are C, A, and Q. Please try again.')
continue
if selection == 'c':
# collect - new db
print('\t----------------------')
print('\tCreating New Database')
print('\t----------------------')
			print('\tDatabase name must be alphanumeric, may contain a "_", and has a maximum length of 40 characters.')
# interaction: loop until we get acceptable input
while True:
db_name = input('\tEnter new database name: ').lower()
				if len(db_name) <= 40 and re.search('^[a-zA-Z0-9_]+$', db_name):
print(f'\tCreating new db with name {db_name}')
break
else:
print('\tName was invalid, try again.')
continue
sql_driver.create_wbxr_db(db_name)
sql_driver.set_config(config)
elif selection == 'a':
# collect - add to db
print('\t---------------------------')
print('\tAdding to Existing Database')
print('\t---------------------------')
print('\tThe following webXray databases are available:')
db_name = utilities.select_wbxr_db()
if db_name:
print('\tUsing database: %s' % db_name)
else:
quit()
# we have selected the db to use, now move on to collection
print('\t--------------------')
print('\tSelecting Page List')
print('\t--------------------')
print('\tPlease select from the available files in the "page_lists" directory:')
		# webXray needs a file with a list of page urls to scan; these files should be kept in the
		# 'page_lists' directory. The code below shows all available page lists and returns
		# the name of the selected list.
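		# Note: page lists are assumed here to be plain-text files with one
		# full URL per line, e.g.:
		#	https://example.com
		#	https://example.org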
files = os.listdir(path='./page_lists')
if len(files) == 0:
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
print('ERROR: No page lists found, check page_lists directory.')
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
quit()
# alpha sort file list for easier selection
files.sort()
# print out pages lists to choose from
print('\tPage Lists Available:')
for index,file in enumerate(files):
print('\t\t[%s] %s' % (index, file))
# interaction: loop until we get acceptable input
while True:
selection = input("\n\tChoose a page list by number: ")
if selection.isdigit():
selection = int(selection)
if selection >= 0 and selection < len(files):
break
else:
print('\tInvalid choice, try again.')
continue
else:
print('\tInvalid choice, try again.')
continue
pages_file_name = files[selection]
print('\tPages file is "%s"' % pages_file_name)
print('\t------------------')
print('\tBeginning webXray')
print('\t------------------')
time.sleep(1)
collect(db_name, pages_file_name)
print('\t---------------------')
print('\t Collection Finished!')
print('\t---------------------')
		# lets us go back to analyze
interaction()
elif selection == 'a':
# analyze
print('\t==============')
print('\tAnalyzing Data')
print('\t==============')
print('\t----------------------------------------------')
print('\tThe following webXray databases are available:')
print('\t----------------------------------------------')
db_name = utilities.select_wbxr_db()
if db_name:
print('\tUsing database: %s' % db_name)
else:
quit()
# go do the analysis
analyze(db_name)
# restart interaction
interaction()
elif selection == 'pc':
		# policy collect
print('\t=====================')
print('\t Collecting Policies ')
print('\t=====================')
print('\t----------------------------------------------')
print('\tThe following webXray databases are available:')
print('\t----------------------------------------------')
db_name = utilities.select_wbxr_db()
if db_name:
print('\tUsing database: %s' % db_name)
else:
quit()
# go get the policies
collect(db_name,task='get_policy')
# restart interaction
interaction()
elif selection == 'pa':
		# policy analyze
print('\t====================')
print('\t Analyzing Policies ')
print('\t====================')
print('\t----------------------------------------------')
print('\tThe following webXray databases are available:')
print('\t----------------------------------------------')
db_name = utilities.select_wbxr_db()
if db_name:
print('\tUsing database: %s' % db_name)
else:
quit()
		# go analyze the policies
policy_report(db_name)
# restart interaction
interaction()
# interaction
def collect(db_name, pages_file_name=None, task='get_scan'):
"""
manage the loading of pages, extracting relevant data, and storing to db
may also be called in stand-alone with 'run_webxray.py -c [DB_NAME] [PAGE_FILE_NAME]'
"""
from webxray.Collector import Collector
# if db doesn't exist, create it
if sql_driver.db_exists(db_name) == 0:
print('\t------------------------------')
print('\tCreating DB: %s' % db_name)
print('\t------------------------------')
sql_driver.create_wbxr_db(db_name)
sql_driver.set_config(config)
# needed to display runtime info
start_time = datetime.datetime.now()
# the main event
collector = Collector(db_name,db_engine,client_id)
if task == 'get_scan':
build_task_queue(db_name, 'get_scan', pages_file_name=pages_file_name)
elif task=='get_policy':
build_task_queue(db_name, 'get_policy')
elif task=='get_random_crawl':
build_task_queue(db_name, 'get_random_crawl', pages_file_name=pages_file_name)
collector.run(task='process_tasks_from_queue', pool_size=pool_size)
# fyi
utilities.print_runtime('Data collection', start_time)
# collect
def build_task_queue(db_name, task, pages_file_name=None, crawl_file_name=None):
"""
builds the queue of pages to be scanned, does no scanning itself, can
only be called by CLI
"""
from webxray.Collector import Collector
# if db doesn't exist, create it
if sql_driver.db_exists(db_name) == 0:
print('\t------------------------------')
print('\tCreating DB: %s' % db_name)
print('\t------------------------------')
sql_driver.create_wbxr_db(db_name)
sql_driver.set_config(config)
# needed to display runtime info
start_time = datetime.datetime.now()
# the main event
collector = Collector(db_name,db_engine,client_id)
if task == 'get_scan':
print('\t---------------------------------')
print('\t Adding page scans to task queue ')
print('\t---------------------------------')
collector.build_scan_task_queue(params = {
'pages_file_name' : pages_file_name,
'flush_scan_task_queue' : True,
'task' : 'get_scan'
})
elif task == 'get_random_crawl':
print('\t-----------------------------------------')
print('\t Adding random crawl scans to task queue ')
print('\t-----------------------------------------')
collector.build_scan_task_queue(params = {
'pages_file_name' : pages_file_name,
'flush_scan_task_queue' : True,
'task' : 'get_random_crawl'
})
elif task == 'get_crawl':
print('\t-----------------------------')
print('\t Adding crawls to task queue ')
print('\t-----------------------------')
collector.build_crawl_task_queue(params = {
'crawl_file_name' : crawl_file_name,
'flush_crawl_task_queue' : True
})
elif task == 'get_policy':
print('\t-----------------------------------')
print('\t Adding policy scans to task queue ')
print('\t-----------------------------------')
collector.build_policy_task_queue(flush_policy_task_queue=True)
# fyi
utilities.print_runtime('Build task queue', start_time)
# build_task_queue
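# Illustrative distributed workflow built around build_task_queue (the
# database and file names are placeholders):
#
#	on the db server:	python3 run_webxray.py --build_queue my_db get_scan my_pages.txt
#	on each worker:		python3 run_webxray.py --worker my_db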
def worker_collect(db_name):
"""
manage the loading of pages, extracting relevant data, and storing to db
may also be called in stand-alone with 'run_webxray.py --worker [DB_NAME]'
"""
from webxray.Collector import Collector
# needed to display runtime info
start_time = datetime.datetime.now()
# the main event
collector = Collector(db_name,db_engine,client_id)
collector.run(task='process_tasks_from_queue', pool_size=pool_size)
# fyi
utilities.print_runtime('Data collection', start_time)
# worker_collect
def analyze(db_name):
"""
perform analysis, generate reports and store them in ./reports
may also be called in stand-alone with 'run_webxray.py -a [DB_NAME]'
"""
from webxray.Reporter import Reporter
# needed to display runtime info
start_time = datetime.datetime.now()
# set how many tlds you want to produce sub-reports for
num_tlds = None
# set reports to only get the top X results, set to None to get everything
num_results = 500
# set up a new reporter
reporter = Reporter(db_name, db_engine, num_tlds, num_results, flush_domain_owners=False)
# this is the main suite of reports, comment out those you don't need
reporter.generate_db_summary_report()
reporter.generate_stats_report()
reporter.generate_aggregated_tracking_attribution_report()
reporter.generate_3p_domain_report()
reporter.generate_3p_aggregate_owner_report()
reporter.generate_3p_request_report()
reporter.generate_3p_request_report('script')
reporter.generate_use_report()
	# the following reports may produce very large files; all but the first are commented out by default
reporter.generate_per_site_network_report()
# reporter.generate_per_page_network_report()
# reporter.generate_all_pages_request_dump()
# reporter.generate_all_pages_cookie_dump()
# fyi
utilities.print_runtime('Report generation', start_time)
# analyze
def single(url):
"""
For one-off analyses printed to CLI, avoids db calls entirely
"""
from webxray.SingleScan import SingleScan
single_scan = SingleScan()
single_scan.execute(url, config)
# single
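# Example: python3 run_webxray.py -s https://example.com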
def policy_report(db_name):
"""
perform of policies, generate reports and store them in ./reports
may also be called in stand-alone with 'run_webxray.py -p [DB_NAME]'
"""
from webxray.Reporter import Reporter
# needed to display runtime info
start_time = datetime.datetime.now()
# set how many tlds you want to produce sub-reports for
num_tlds = None
# set reports to only get the top X results, set to None to get everything
num_results = 100
# set up a new reporter
reporter = Reporter(db_name, db_engine, num_tlds, num_results, flush_domain_owners=False)
# do relevant policy reports
reporter.initialize_policy_reports()
reporter.generate_policy_summary_report()
reporter.generate_policy_owner_disclosure_reports()
reporter.generate_policy_gdpr_report()
reporter.generate_policy_pacification_report()
reporter.generate_policy_pii_report()
# fyi
utilities.print_runtime('Report generation', start_time)
# policy_report
def rate_estimate(db_name, client_id):
"""
Tells us how much longer to go...
"""
print('Showing scan rate for database %s' % db_name)
if client_id:
print('\tclient_id is %s' % client_id)
else:
client_id = None
print()
print()
print('elapsed_minutes\tcurrent_rate\taverage_rate\tremaining_tasks\tremaining_hours')
print('---------------\t------------\t------------\t---------------\t---------------')
utilities = Utilities(db_name=db_name,db_engine=db_engine)
for result in utilities.stream_rate():
print('%s\t\t%s\t\t%s\t\t%s\t\t%s' % (
result[client_id]['elapsed_minutes'],
result[client_id]['current_rate'],
result[client_id]['average_rate'],
result[client_id]['remaining_tasks'],
result[client_id]['remaining_hours']
)
)
# rate_estimate
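# Example: python3 run_webxray.py --rate my_db
# (an optional client_id may be passed as a second argument)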
def store_results_from_queue():
"""
If we have results in our result_queue we will
process/store them. Can be run in parallel
with server if set to queue results.
"""
from webxray.Collector import Collector
collector = Collector(db_engine=db_engine)
collector.run(task='store_results_from_queue', pool_size=pool_size)
# store_results_from_queue
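# Example: python3 run_webxray.py --store_queue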
def run_client():
"""
Start the remote client, note this only performs scans
and uploads to the server and runs until stopped.
However, since Chrome can crash it is a good idea
to have this restarted periodically by a
cron job.
"""
from webxray.Client import Client
client = Client('YOUR_SERVER_URL')
client.run_client()
# run_client
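# One possible crontab entry to restart the client hourly (the path is a
# placeholder; you may also want to kill a stale instance first):
#
#	0 * * * * cd /path/to/webxray && python3 run_webxray.py --run_client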
if __name__ == '__main__':
print('''
_ __ __
__ _____| |__\ \/ /_ __ __ _ _ _
\ \ /\ / / _ \ '_ \\\\ /| '__/ _` | | | |
\ V V / __/ |_) / \| | | (_| | |_| |
\_/\_/ \___|_.__/_/\_\_| \__,_|\__, |
|___/
''')
# set up cli args
parser = optparse.OptionParser()
parser.add_option(
'--scan_pages',
action='store_true',
dest='scan_pages',
		help='Scan Pages: only scan the URLs specified - Args: [db_name] [page_file_name]'
)
parser.add_option(
'--crawl_sites',
action='store_true',
dest='crawl_sites',
		help='Crawl Sites: scan each URL specified plus 3 random internal pages - Args: [db_name] [page_file_name]'
)
parser.add_option(
'--build_queue',
action='store_true',
dest='build_queue',
		help='Build page queue: should be run on the db server, leaving scanning to workers - Args: [db_name] [task] [page_file_name or crawl_file_name]'
)
parser.add_option(
'--worker',
action='store_true',
dest='worker',
help='Collect Unattended as Worker: Simpler Alternative to Distributed Client - Args: [db_name]'
)
parser.add_option(
'-s',
action='store_true',
dest='single',
		help='Single Site: for One-Off Tests - Args: [url to analyze]'
)
parser.add_option(
'-a',
action='store_true',
dest='analyze',
help='Analyze Unattended: Best for Large Datasets - Args: [db_name]'
)
parser.add_option(
'--policy_collect',
action='store_true',
dest='policy_collect',
help='Policy Collect Unattended: Best for Large Datasets - Args: [db_name]'
)
parser.add_option(
'--policy_analyze',
action='store_true',
dest='policy_report',
help='Policy Report Unattended: Best for Large Datasets - Args: [db_name]'
)
parser.add_option(
'--rate',
action='store_true',
dest='rate_estimate',
help='Estimates time remaining on scan - Args: [db_name]'
)
parser.add_option(
'--store_queue',
action='store_true',
dest='store_results_from_queue',
		help='Stores any results in the queue'
)
parser.add_option(
'--run_client',
action='store_true',
dest='run_client',
help='Runs the distributed client'
)
(options, args) = parser.parse_args()
# set mode
if options.scan_pages:
mode = 'scan_pages'
elif options.crawl_sites:
mode = 'crawl_sites'
elif options.build_queue:
mode = 'build_queue'
elif options.store_results_from_queue:
mode = 'store_results_from_queue'
elif options.worker:
mode = 'worker'
elif options.single:
mode = 'single'
elif options.analyze:
mode = 'analyze'
elif options.policy_collect:
mode = 'policy_collect'
elif options.policy_report:
mode = 'policy_report'
elif options.rate_estimate:
mode = 'rate_estimate'
elif options.run_client:
mode = 'run_client'
else:
mode = 'interactive'
# do what we're supposed to do
if mode == 'interactive':
interaction()
elif mode == 'scan_pages':
try:
db_name = args[0]
pages_file_name = args[1]
except:
print('Need a db name and pages file name!')
quit()
collect(db_name, pages_file_name=pages_file_name, task='get_scan')
elif mode == 'crawl_sites':
try:
db_name = args[0]
pages_file_name = args[1]
except:
print('Need a db name and pages file name!')
quit()
collect(db_name, pages_file_name=pages_file_name, task='get_random_crawl')
elif mode == 'single':
try:
url = args[0]
except:
print('URL needs to be supplied as an argument!')
quit()
single(url)
elif mode == 'analyze':
try:
db_name = args[0]
except:
print('Need a db name!')
quit()
analyze(db_name)
elif mode == 'policy_collect':
try:
db_name = args[0]
except:
print('Need a db name!')
quit()
collect(db_name,task='get_policy')
elif mode == 'policy_report':
try:
db_name = args[0]
except:
print('Need a db name!')
quit()
policy_report(db_name)
elif mode == 'worker':
try:
db_name = args[0]
except:
print('Need a db name!')
quit()
worker_collect(db_name)
elif mode == 'build_queue':
try:
db_name = args[0]
task = args[1]
except:
			print('Need a db name and task name!')
quit()
		# if we are doing get_scan or get_random_crawl we also need a page file name
if task == 'get_scan' or task == 'get_random_crawl':
try:
page_file = args[2]
except:
				print('Need a page file name for get_scan/get_random_crawl')
quit()
build_task_queue(db_name, task, pages_file_name=page_file)
elif task == 'get_crawl':
try:
page_file = args[2]
except:
print('Need a crawl file name for get_crawl')
quit()
build_task_queue(db_name, task, crawl_file_name=page_file)
else:
# get_policy
build_task_queue(db_name, task)
elif mode == 'store_results_from_queue':
store_results_from_queue()
elif mode == 'rate_estimate':
try:
db_name = args[0]
except:
print('Need a db name!')
quit()
try:
client_id = args[1]
except:
client_id = None
rate_estimate(db_name,client_id)
elif mode == 'run_client':
run_client()
quit()
# main