http_requests_ip.py
import argparse
import csv
import logging
import os
import sys
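
# Shared helpers defined in utils.py. Judging from how they are used below:
# spark_session builds a Spark session wired to the Azure storage account,
# read_nginx_logs loads Nginx access logs for a timestamp range, and
# Log4JProxyHandler forwards this script's log records to Spark's Log4J logger.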
from utils import Log4JProxyHandler, spark_session, read_nginx_logs

STORAGE_ACCOUNT_NAME = os.environ['STORAGE_ACCOUNT_NAME']
STORAGE_ACCOUNT_KEY = os.environ['STORAGE_ACCOUNT_KEY']

logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)
LOGGER = logging.getLogger("HttpRequestsSingleIp")
azure_logger = logging.getLogger("azure")
azure_logger.setLevel(logging.ERROR)


def exclude_requests(df):
    """Exclude requests generated by internal scanning and monitoring tools (Nessus and PRTG).
    """
    # Exclude requests from Nessus
    df = df.filter(df.remote_ip != "10.6.20.97")
    # Exclude requests from PRTG
    df = df.filter(df.user_agent != "Mozilla/5.0 (compatible; PRTG Network Monitor (www.paessler.com); Windows)")
    return df


def filter_requests(df, ip):
    """Filter the DataFrame down to requests originating from the IP address of interest.
    """
    df = df.filter(df.remote_ip == ip)
    # Display a sample of the matching requests in the Spark console as a quick sanity check.
    df.show()
    return df


def write_report(df, filename):
    """For the passed-in DataFrame, write out the contents to a CSV file under /out.
    """
    df = df.coalesce(1).orderBy("timestamp")
    fieldnames = df.columns
    # Stream rows through the driver one at a time rather than collecting the whole
    # DataFrame, and use a context manager so the file is flushed and closed on exit.
    with open(f"/out/{filename}", "w", newline="") as temp_file:
        writer = csv.DictWriter(temp_file, fieldnames)
        writer.writeheader()
        for row in df.toLocalIterator():
            writer.writerow(row.asDict())
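

# Example invocation (the timestamps, IP address and filename below are illustrative placeholders):
#   python http_requests_ip.py --start 2024010100 --end 2024010123 --ip 192.0.2.10 --filename requests_192.0.2.10.csv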
if __name__ == "__main__":
    all_args = argparse.ArgumentParser()
    all_args.add_argument("--start", action="store", type=str, required=True, help="Starting (earliest) Nginx log timestamp, YYYYmmddHH")
    all_args.add_argument("--end", action="store", type=str, required=True, help="Ending (latest, inclusive) Nginx log timestamp, YYYYmmddHH")
    all_args.add_argument("--ip", action="store", type=str, required=True, help="IP address to filter HTTP requests for")
    all_args.add_argument("--filename", action="store", type=str, required=True, help="Filename for the CSV report output")
    args = vars(all_args.parse_args())
    start_timestamp = args["start"]
    end_timestamp = args["end"]
    ip = str(args["ip"])
    filename = str(args["filename"])
    # Build a Spark session configured for the Azure storage account and route this
    # script's log records through Spark's Log4J logger.
    session = spark_session(STORAGE_ACCOUNT_NAME, STORAGE_ACCOUNT_KEY)
    pyspark_handler = Log4JProxyHandler(session)
    LOGGER.addHandler(pyspark_handler)
    LOGGER.info("Starting report generation")
    # Read the logs for the requested window, drop monitoring traffic, keep only requests
    # from the target IP, then write out the CSV report.
    df = read_nginx_logs(start_timestamp, end_timestamp, session, STORAGE_ACCOUNT_NAME, STORAGE_ACCOUNT_KEY)
    df = exclude_requests(df)
    df = filter_requests(df, ip)
    write_report(df, filename)