-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLesson6-DatasetsQuestions.py
66 lines (50 loc) · 1.95 KB
/
Lesson6-DatasetsQuestions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/python3
"""
Starter code for exploring the Enron dataset (emails + finances);
loads up the dataset (pickled dict of dicts).
The dataset has the form:
enron_data["LASTNAME FIRSTNAME MIDDLEINITIAL"] = { features_dict }
{features_dict} is a dictionary of features associated with that person.
You should explore features_dict as part of the mini-project,
but here's an example to get you started:
enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000
"""
import joblib
# Load the pickled Enron dataset: a dict keyed by person name whose values are
# per-person feature dicts (see the module docstring).
# The context manager guarantees the file handle is closed even if unpickling
# fails; previously the handle returned by open() was never closed.
# NOTE(review): joblib.load unpickles arbitrary objects -- only point this at
# a trusted dataset file.
with open("../final_project/final_project_dataset.pkl", "rb") as dataset_file:
    enron_data = joblib.load(dataset_file)
# How many data points (people) are in the dataset?
# Every top-level key of enron_data is one person, so the size is reported
# twice: once from the dict itself and once from its key view.
people_count = len(enron_data)
print(people_count)
person_keys = enron_data.keys()
print(len(person_keys))
# For each person, how many features are available?
# One record (Jeffrey Skilling's) is used as the sample; the number of entries
# in its feature dict is printed.
skilling_features = enron_data['SKILLING JEFFREY K']
print(len(skilling_features))
# How many POIs are there in the E+F dataset?
# Tally every person whose 'poi' flag equals 1 (True == 1 in Python, so a
# boolean flag is counted as well).
count = sum(1 for user in enron_data if enron_data[user]['poi'] == 1)
print(count)

# Cross-check: collect the names of everyone with a truthy 'poi' flag. The
# length matches the tally above as long as 'poi' only ever holds
# True/False (or 1/0).
poi = [name for name, features in enron_data.items() if features['poi']]
print(len(poi))
# How many POIs were there in total?
# Count the lines of the POI name list, skipping the first two lines
# (presumably a header -- confirm against the file).
poi_text = '../final_project/poi_names.txt'
# The context manager closes the file even if reading fails.
with open(poi_text, "r") as poi_names:
    fr = poi_names.readlines()
print(len(fr[2:]))
# What is the total value of the stock belonging to James Prentice?
# Dump the complete feature dict of each person looked up for this part of
# the exercise so the individual values can be read from the output.
for person in ('PRENTICE JAMES', 'COLWELL WESLEY', 'SKILLING JEFFREY K'):
    print(enron_data[person])
# How much money did that person get?
# Look up 'total_payments' for three named people and print the
# (name, payment) pairs ordered from the largest payment to the smallest.
names = ['SKILLING JEFFREY K', 'FASTOW ANDREW S', 'LAY KENNETH L']
names_payments = {}
for name in names:
    names_payments[name] = enron_data[name]['total_payments']
ranked_payments = list(names_payments.items())
ranked_payments.sort(key=lambda pair: pair[1], reverse=True)
print(ranked_payments)
# How many folks in this dataset have a quantified salary? What about a known email address?
import pandas as pd

# Building a DataFrame from the dict of dicts gives one column per person and
# one row per feature, so df.loc[feature, :] selects that feature for everyone.
df = pd.DataFrame(enron_data)
# The string 'NaN' is treated as the missing-value marker. Comparing against
# it yields a boolean Series, and Series.sum() counts the True entries in
# vectorised code instead of iterating in Python with the builtin sum().
print((df.loc['salary', :] != 'NaN').sum())
print((df.loc['email_address', :] != 'NaN').sum())