-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCsvFilter.py
35 lines (29 loc) · 1.15 KB
/
CsvFilter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import re
import time
import pandas as pd
import numpy as np
class CsvFilter:
    """Split DataFrame chunks into rows that contain a listed word and rows that do not.

    Each call to :meth:`FilterWords` partitions one chunk, cuts both
    partitions into pieces of at most ``n`` rows, and hands the two lists of
    pieces to the ``onMatch`` and ``onFailure`` callbacks respectively.

    Attributes:
        onMatch: Callback receiving the list of sub-frames whose rows matched.
        onFailure: Callback receiving the list of sub-frames whose rows did
            not match.
        word_list: The word table passed to the constructor, kept as given.
        columns: Positional column indices whose text is scanned.
        badWordsRegex: Compiled, case-insensitive alternation of the words.
    """

    def __init__(self, onMatch, onFailure, word_list, columns=(0, 2, 4)):
        """Store the callbacks and compile the word list into one regex.

        Args:
            onMatch: Callable invoked with the matching rows.
            onFailure: Callable invoked with the non-matching rows.
            word_list: Object exposing ``.values.tolist()`` that yields rows
                whose first element is a word (e.g. a one-column DataFrame).
            columns: Positional indices of the chunk columns to scan.
                Defaults to ``(0, 2, 4)``.
        """
        self.onMatch = onMatch
        self.onFailure = onFailure
        self.word_list = word_list
        self.columns = tuple(columns)

        badWordsList = self.word_list.values.tolist()
        # Skip empty and non-string entries (e.g. NaN from blank CSV cells):
        # an empty alternative would make the pattern match every string and
        # re.escape() rejects non-strings.
        escaped = [
            re.escape(row[0])
            for row in badWordsList
            if isinstance(row[0], str) and row[0]
        ]
        # With no usable words, '|'.join(...) would be '' which matches
        # everything; '(?!)' is a pattern that can never match.
        pattern = '|'.join(escaped) if escaped else r'(?!)'
        self.badWordsRegex = re.compile(pattern, re.IGNORECASE)

    def FilterWords(self, chunk, n):
        """Partition ``chunk`` by the word regex and dispatch both halves.

        A row matches when any of its scanned columns holds a string that
        contains one of the words (case-insensitive). Cells that are not
        strings are treated as containing no word. Both partitions keep the
        chunk's original row order and index labels.

        Args:
            chunk: A pandas DataFrame to filter, or a bare ``object()``
                instance, which is forwarded untouched to both callbacks
                (presumably an end-of-stream sentinel; confirm with caller).
            n: Maximum number of rows per emitted sub-frame. Passed to
                ``range`` as the step, so it must be non-zero.

        Returns:
            ``0`` when ``chunk`` is a bare ``object()`` instance, otherwise
            ``None``.
        """
        if type(chunk) is object:
            self.onMatch(chunk)
            self.onFailure(chunk)
            return 0

        print("start")

        regex = self.badWordsRegex
        columns = self.columns
        # search(), not match(): a word must be found anywhere inside a field,
        # not only at the very start. Fields are tested one by one so a word
        # cannot be fabricated across the boundary between two fields.
        flags = [
            any(
                isinstance(row[col], str) and regex.search(row[col]) is not None
                for col in columns
            )
            for row in chunk.itertuples(index=False, name=None)
        ]
        # A positional boolean array (rather than an indexed Series) keeps the
        # selection correct even when the chunk has duplicate index labels.
        mask = np.array(flags, dtype=bool)

        matched = chunk[mask]
        # The complement of the mask is exactly the set of non-matching rows;
        # no merge is needed, and order, index and dtypes are left intact.
        unmatched = chunk[~mask]

        list_df = [matched[i:i + n] for i in range(0, matched.shape[0], n)]
        list_df1 = [unmatched[i:i + n] for i in range(0, unmatched.shape[0], n)]

        self.onMatch(list_df)
        self.onFailure(list_df1)