-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadfilter2.py
472 lines (441 loc) · 19.1 KB
/
adfilter2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
# pylint: disable=C0301, C0103, C1001, C0111
# Settings shared by the classes below, kept in one module-level dict.
globalLibrary = dict(
    # gen-2.cloudapp.net was an azure testing server that we used
    domain='gen-2.cloudapp.net',
    # path of the filter list that gets loaded into the look up tables
    listdir='easylist.txt',
    # server address of the socket we proxied ads through
    server_address='./uds_socket2',
)
# global dependencies
import re
import timeit
# unix sockets implementation
import socket
import sys
import os
# encryption
import base64
from Crypto.Cipher import AES
from Crypto import Random
# Look up class
# This generates a look up table based on easylist.txt
# Can be used to check if a string will be blocked by adblock filter
# Uses a python dict, equivalent of a hash table to allow for O(1) look up time, assuming no hash collision
# Will take O(F) [F is size of filters] to generate hash tables at beginning
class lookup:
    """Look up table built from an Adblock Plus style filter list (easylist).

    URL filters are indexed in a dict keyed by the first ``n`` characters of
    the longest literal (wildcard-free) piece of each filter. A URL can only
    match such a filter if it contains that key, so ``match_url`` only runs
    the compiled regexes of filters whose key occurs in the URL. Filters
    whose literal pieces are all shorter than ``n`` go into an "ex" list
    that is always scanned.

    Element hiding rules of the form ``###some-id`` / ``##.some-class`` are
    stored in dicts keyed by the raw id / class name.

    Every stored rule is a dict with three keys:
        'T': the raw rule text (for element rules, without the leading
             '#' or '.'),
        'R': the rule translated into a regular expression string,
        'X': the compiled form of 'R'.

    NOTE: filter options (the ``$third-party`` style suffix) are not
    interpreted; they are escaped as literal text, so URL filters that carry
    options are stored but will not match ordinary URLs. Matching is
    case-sensitive because the regexes are compiled without flags.
    """

    # Regex for the '^' placeholder: a separator character (anything that is
    # not a letter, digit or one of "_-.%") or the end of the address.
    _SEPARATOR = r"(?:[^A-Za-z0-9_\-.%]|$)"
    # Regex for a leading '||': the scheme followed by optional sub-domains,
    # so the rest of the filter is anchored at the start of a domain label.
    _DOMAIN_ANCHOR = r"^[A-Za-z][A-Za-z0-9+.\-]*://(?:[^/?#]*\.)?"
    # Markers that identify a line as an element hiding rule of some kind.
    _ELEM_MARKERS = ("##", "#@#", "#?#")

    def __init__(self, listdir=None):
        """Create empty tables and load the filter list.

        listdir -- path of the filter list; None (the default) makes
                   parseList fall back to globalLibrary['listdir'].
        """
        # preferred number for various substrings
        self.__pref = {
            'n': {
                'url': 8,
                'id': 5,
                'class': 5
            }
        }
        # rules dict for storing url, id, class, elem rules
        self.__rules = {
            'url': [],
            'id': [],
            'class': [],
            'elem': []
        }
        # look up tables: 'short' is the hash table, 'ex' the list of rules
        # that could not be given a hash key
        self.__lookup = {
            'url': {'short': {}, 'ex': []},
            'id': {'short': {}, 'ex': []},
            'class': {'short': {}, 'ex': []},
            'elem': {'short': {}, 'ex': []}
        }
        self.parseList(listdir)

    @classmethod
    def _toRegexp(cls, text, anchors=True):
        """Translate filter text into a regular expression string.

        '*' becomes '.*', '^' becomes the separator class and every other
        character is escaped. When ``anchors`` is true a leading '||' is
        turned into a domain anchor, a leading '|' into '^' and a trailing
        '|' into '$'; a '|' anywhere else is treated as a literal.
        """
        prefix = ""
        suffix = ""
        if anchors:
            if text.startswith("||"):
                prefix = cls._DOMAIN_ANCHOR
                text = text[2:]
            elif text.startswith("|"):
                prefix = "^"
                text = text[1:]
            if text.endswith("|"):
                suffix = "$"
                text = text[:-1]
        pieces = []
        # the capturing group keeps the '*' / '^' tokens in the split result
        for token in re.split(r"([*^])", text):
            if token == "*":
                pieces.append(".*")
            elif token == "^":
                pieces.append(cls._SEPARATOR)
            else:
                pieces.append(re.escape(token))
        return prefix + "".join(pieces) + suffix

    # Function to parse element rules and put into look up table
    def __parseElemRule(self, elemText):
        """Store one element hiding selector.

        elemText is the rule without its leading '##'. A selector starting
        with '#' is stored as an id rule, one starting with '.' as a class
        rule; anything else (including an empty string) is ignored.
        """
        if not elemText:
            return
        kind = {"#": "id", ".": "class"}.get(elemText[0])
        if kind is None:
            return
        # the regex is built from the whole selector, prefix included;
        # anchors are disabled because '|' has no anchor meaning here
        pattern = self._toRegexp(elemText, anchors=False)
        rule = {
            'T': elemText[1:],
            'R': pattern,
            'X': re.compile(pattern)
        }
        self.__rules[kind].append(rule)
        table = self.__lookup[kind]
        # selectors containing '*' or '^' cannot be looked up by exact name
        if "*" in rule['T'] or "^" in rule['T']:
            table['ex'].append(rule)
        else:
            table['short'][rule['T']] = rule

    # Function to parse URL blocking rules and put into lookup table
    def __parseUrlRule(self, urlText):
        """Compile one URL filter and index it. Empty text is ignored."""
        if not urlText:
            # an empty pattern would compile to a regex matching every URL
            return
        pattern = self._toRegexp(urlText)
        rule = {
            'T': urlText,
            'R': pattern,
            'X': re.compile(pattern)
        }
        self.__rules['url'].append(rule)
        n = self.__pref['n']['url']
        # longest piece not interrupted by '|', '^' or '*'; that text has to
        # appear verbatim in every URL the filter matches
        longest = max(re.split(r"[|^*]", urlText), key=len)
        if len(longest) >= n:
            key = longest[:n]
            self.__lookup['url']['short'].setdefault(key, []).append(rule)
        else:
            # fewer than n literal chars: append to the manual lookup list
            self.__lookup['url']['ex'].append(rule)

    def parseList(self, listdir=None):
        """Read a filter list file and add its rules to the tables.

        listdir -- path of the filter list; None means
                   globalLibrary['listdir'].

        Blank lines, '!' comments, the '[...]' header line, '@@' whitelist
        rules and domain-scoped element hiding rules are skipped. Parsing
        stops at the first line containing '! Asset Listings'.
        """
        if listdir is None:
            listdir = globalLibrary['listdir']
        with open(listdir) as filterList:
            for line in filterList:
                line = line.strip()
                # end parsing, since asset listings signifies end
                if line.find('! Asset Listings') != -1:
                    print ("Easylist.txt has been loaded into hashtable~")
                    break
                if not line or line[0] in "![":
                    continue
                # whitelist rules are not handled
                if line[:2] == "@@":
                    continue
                if line[:2] == "##":
                    # universal element hiding rule
                    self.__parseElemRule(line[2:])
                elif any(marker in line for marker in self._ELEM_MARKERS):
                    # element hiding rule scoped to specific domains; it is
                    # not a URL filter, so do not index it as one
                    continue
                else:
                    # no identifier defaults to url rule
                    self.__parseUrlRule(line)

    # function to match elements and classes against easylist
    def match_elem(self, elemType, elemVals):
        """Check an id or a class attribute value against the tables.

        elemType -- 'id' or 'class'.
        elemVals -- the attribute value; for 'class' it is split on single
                    spaces and every piece is checked on its own.

        Returns a list of booleans: one entry for 'id', one entry per
        space-separated piece for 'class', and [False] for any other type.
        """
        # TO-DO: add iteration through EX rules
        if elemType == 'id':
            return [elemVals in self.__lookup['id']['short']]
        if elemType == 'class':
            known = self.__lookup['class']['short']
            return [name in known for name in elemVals.split(" ")]
        return [False]

    # Function to match URL against easylist hashtable
    def match_url(self, queryUrl):
        """Return True if any loaded URL filter matches queryUrl, else False."""
        n = self.__pref['n']['url']
        shortcuts = self.__lookup['url']['short']
        tried = set()
        # every substring of length n, walking from the end of the url
        for start in range(len(queryUrl) - n, -1, -1):
            key = queryUrl[start:start + n]
            if key in tried:
                continue
            tried.add(key)
            # only rules sharing this key can match; confirm with the regex
            for rule in shortcuts.get(key, ()):
                if rule['X'].search(queryUrl) is not None:
                    return True
        # rules without a hash key always have to be checked
        for rule in self.__lookup['url']['ex']:
            if rule['X'].search(queryUrl) is not None:
                return True
        return False

    # Function used to debug
    def debug(self):
        """Print, in order: rules reachable through the url hash table, the
        number of hash keys, the number of 'ex' url rules and the total
        number of url rules."""
        shortcuts = self.__lookup['url']['short']
        print (sum(len(bucket) for bucket in shortcuts.values()))
        print (len(shortcuts))
        print (len(self.__lookup['url']['ex']))
        print (len(self.__rules['url']))
# This class was originally used to encrypt and proxy ads through the publisher's server
# Doing so would unblock ads from the ad-blocker
# It would be impossible for adblockers to block ads unless they listed the root domain on easylist.txt
# Please note that this section of the code is not 100% complete, and it can't be tested unless an apache server with web sockets is available
# Therefore I have decided to comment out the code that initializes this class. Feel free to read through it if you like.
class dataHandler:
    """Unix-socket server that rewrites ad related HTML attributes.

    Constructing an instance binds a unix domain socket at
    globalLibrary['server_address'], builds the ``lookup`` tables and then
    serves connections forever: every chunk received is passed through
    ``handle`` and the result is sent back on the same connection.

    In each HTML tag found, an ``id`` or ``class`` value known to the lookup
    tables is replaced by its base64 form, and any other attribute value
    containing 'http' that matches a URL filter is replaced by a URL on
    globalLibrary['domain'] carrying the AES encrypted original.
    """

    # Regexes for checking HTML items
    # TO-DO: Regex for JS and CSS items
    regexps = {
        # whole tag: group 1 is the tag name, group 2 the attribute string;
        # an optional '/' (self-closing tag) or '\' may precede the '>'
        "htmltag": re.compile(r"""<([\w\-_]+) ((?:[\w\-_:]+ ?= ?["'].*?["'] ?)+)[\\/]?>"""),
        # one attribute: group 1 key, group 2 quote, group 3 value; the
        # back-reference makes the closing quote match the opening one
        "htmlattr": re.compile(r"""([\w\-:]+) ?= ?(["'])(.*?)\2"""),
        # the three entries below are no longer used by this class and are
        # only kept so existing readers of ``regexps`` keep working
        "htmlattrs": re.compile(r""".+? ?= ?["'].*?["']"""),
        "htmlkey": re.compile(r"""(.+?) ?= ?"""),
        "htmlval": re.compile(r""".+? ?= ?["'](.*?)["']""")
    }

    # We used standard aes encrypt and decrypt function for testing. This would've had to be changed
    # should adblock eventually figure out we were using this type of encryption
    def __aesEncrypt(self, path):
        """Return base64(iv + AES-CBC(path)) as text.

        path is encoded as UTF-8 (bytes are used as they are) and padded to
        a multiple of 16 bytes, each pad byte holding the pad length.
        """
        BS = 16
        data = path if isinstance(path, bytes) else path.encode("utf-8")
        padLen = BS - len(data) % BS
        data = data + bytes(bytearray([padLen]) * padLen)
        # SECURITY: hard-coded test key. Anyone who can read this file can
        # decrypt the generated URLs; replace it before any real use.
        key = b'0123456789abcdef'
        iv = Random.new().read(AES.block_size)
        cipher = AES.new(key, AES.MODE_CBC, iv)
        return base64.b64encode(iv + cipher.encrypt(data)).decode("ascii")

    # this function takes a piece of text and calls the encrypt function on that text
    def __elemEncrypt(self, elemText):
        """Return the base64 form of elemText as text (encoding, not encryption)."""
        raw = elemText if isinstance(elemText, bytes) else elemText.encode("utf-8")
        return base64.b64encode(raw).decode("ascii")

    def _replacementFor(self, attr, val):
        """Return the replacement for one attribute value, or None to keep it."""
        attr = attr.lower()
        if attr == 'id':
            if self.mylookup.match_elem('id', val) == [True]:
                return self.__elemEncrypt(val)
            return None
        if attr == 'class':
            # ie class="col-md-6 pull-right": check each class individually
            # and leave the ones that are not blocked alone
            names = val.split(" ")
            flags = self.mylookup.match_elem('class', val)
            if not any(flags):
                return None
            return " ".join(
                self.__elemEncrypt(name) if hit else name
                for name, hit in zip(names, flags)
            )
        # otherwise the attribute may hold a url
        if val.find('http') != -1 and self.mylookup.match_url(val):
            return "http://" + globalLibrary['domain'] + "/" + self.__aesEncrypt(val)
        return None

    # this function is made to handle HTML tags that are blocked by adblocker
    def handleHTMLTag(self, htmltag):
        """Rewrite the blocked attribute values of one tag.

        htmltag -- (tag name, attribute string) as captured by
                   regexps['htmltag'].

        Returns a (tag name, attribute string) tuple; it compares equal to
        htmltag when nothing had to be rewritten.
        """
        def rewrite(match):
            key = match.group(1)
            value = match.group(3)
            replacement = self._replacementFor(key, value)
            if replacement is None:
                return match.group(0)
            # debugging
            print ( "---TC--- " + str(replacement) + " ---K--- " + str(key) + " ---V--- " + str(value))
            whole = match.group(0)
            # splice by position so only the value itself is replaced
            begin = match.start(3) - match.start(0)
            return whole[:begin] + replacement + whole[begin + len(value):]

        return (htmltag[0], self.regexps['htmlattr'].sub(rewrite, htmltag[1]))

    # TO-DO: CSS handler, similar to html handler, just need to write regex to parse css files
    # should be relatively straight forward since css files are very structured
    def handleCSS(self, cssval):
        """Placeholder; not implemented."""
        pass

    # TO-DO: JS handler, similar to html handler, just need to write regex to parse javascript files
    # This one may be slightly more complicated, since js files are more varied...
    def handleJS(self, jsval):
        """Placeholder; not implemented."""
        pass

    # main handler to determine whether to direct databit into handleCSS, handleJS or handleHTMLTag
    # TO-DO: currently unfinished, need to write a regex to determine from string
    def handle(self, dataBit):
        """Rewrite every HTML tag found in dataBit and return the result.

        dataBit may be text or bytes; the return value has the same type.
        Bytes are decoded as latin-1, which maps every byte to one character
        and back, so data outside the rewritten attributes is preserved.
        """
        isBytes = isinstance(dataBit, bytes)
        text = dataBit.decode("latin-1") if isBytes else dataBit

        def rewriteTag(match):
            tag = (match.group(1), match.group(2))
            updated = self.handleHTMLTag(tag)
            if updated == tag:
                return match.group(0)
            whole = match.group(0)
            # replace only this tag's attribute string, by position
            begin = match.start(2) - match.start(0)
            finish = match.end(2) - match.start(0)
            return whole[:begin] + updated[1] + whole[finish:]

        text = self.regexps['htmltag'].sub(rewriteTag, text)
        return text.encode("latin-1") if isBytes else text

    def _serveForever(self):
        """Accept connections one at a time and echo back handled data."""
        while True:
            # Wait for a connection
            print ( 'waiting for a connection')
            self.connection, self.client_address = self.mySock.accept()
            try:
                print ( 'connection from', self.client_address)
                startTime = timeit.default_timer()
                # Receive the data in small chunks and retransmit it
                # NOTE: each chunk is handled on its own, so a tag split
                # across two chunks is not rewritten.
                while True:
                    dataBit = self.connection.recv(8192)
                    if not dataBit:
                        endTime = timeit.default_timer()
                        print ( 'no more data from', self.client_address, "total time = " + str(endTime - startTime))
                        break
                    dataBit = self.handle(dataBit)
                    print ( 'sending data back to the client')
                    self.connection.sendall(dataBit)
            finally:
                # Clean up the connection
                self.connection.close()

    # Init function to call class
    def __init__(self):
        """Bind the socket, build the lookup tables and serve forever.

        This call does not return normally. Raises OSError if a stale socket
        file exists and cannot be removed.
        """
        address = globalLibrary['server_address']
        # Make sure the socket does not already exist
        try:
            os.unlink(address)
        except OSError:
            if os.path.exists(address):
                raise
        # Create UDS socket
        self.mySock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            # Bind the socket to the port
            print ( 'starting up on %s' % address)
            self.mySock.bind(address)
            # Initialize lookup tables
            self.mylookup = lookup()
            self.mylookup.debug()
            # Listen for incoming connections
            self.mySock.listen(1)
            self._serveForever()
        finally:
            # release the listening socket when the loop dies
            self.mySock.close()
# lookup = lookup()
# test.debug()