-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy path0_raw2features_brazilbus.py
54 lines (48 loc) · 1.9 KB
/
0_raw2features_brazilbus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import re
import sys
import unicodedata
import pandas as pd
import networkx as nx
import numpy as np
from tqdm import tqdm
from datetime import date
from glob import glob
from collections import defaultdict
sys.path.append('src')
from features_extractor import features_extractor_single
def match_datasets(data, cities_path="/data/buses_list_of_cities.csv"):
    """Keep only the rows of ``data`` whose origin and destination are listed cities.

    The city list is read from ``cities_path`` and each name is normalised
    (upper-cased, stripped, ``" , "`` collapsed to ``", "``, accents removed)
    before being compared with the ``ORIGEM`` and ``DESTINO`` columns.

    Args:
        data: DataFrame with ``ORIGEM`` and ``DESTINO`` columns. The values are
            compared verbatim against the normalised city names, so they are
            presumably already in the same upper-case, accent-free form --
            TODO confirm against the data-cleaning step.
        cities_path: CSV file holding the list of cities. It is read without a
            header row; the first column is used as the index and the
            remaining column is named ``CityUF``.

    Returns:
        The subset of ``data`` (original order and index preserved) whose
        ``ORIGEM`` and ``DESTINO`` both appear in the city list.
    """
    allcities = pd.read_csv(
        cities_path, index_col=0, names=["CityUF"], encoding="utf-8"
    )
    city_names = (
        allcities["CityUF"]
        .str.upper()
        .str.strip()
        # Literal replacement: be explicit so the result does not depend on
        # the pandas-version-specific default of ``regex``.
        .str.replace(" , ", ", ", regex=False)
        # Decompose accented characters, then drop the non-ASCII combining
        # marks so that e.g. "SÃO" becomes "SAO".
        .str.normalize("NFKD")
        .str.encode("ascii", errors="ignore")
        .str.decode("utf-8")
    )
    known_cities = set(city_names.astype(str))
    both_known = (
        data["ORIGEM"].isin(known_cities) & data["DESTINO"].isin(known_cities)
    )
    return data[both_known]
def bus_network(year=2010,month=12):
    """Build a weighted origin/destination edge list from one year's bus data.

    Reads ``/data/clean_brazilbus_data/<year>.csv``, optionally keeps a single
    month, drops rows whose ``NUMEROLUGAROFERTADOIDA`` is not positive,
    restricts the endpoints to known cities via ``match_datasets`` and sums
    ``NUMEROVIAGEMIDA`` for every (``ORIGEM``, ``DESTINO``) pair.

    Args:
        year: Selects which yearly CSV file is read.
        month: Value matched against the ``MES`` column. Pass ``False`` to
            skip the month filter and keep every row of the year.

    Returns:
        DataFrame with columns ``source``, ``target`` and ``weight``; only
        pairs whose summed ``NUMEROVIAGEMIDA`` is positive are kept.
    """
    trips = pd.read_csv(
        "/data/clean_brazilbus_data/{}.csv".format(year),
        index_col=None,
    )

    # ``False`` (not ``None`` or 0) is the sentinel meaning "all months".
    if month is not False:
        trips = trips[trips["MES"] == month]

    trips = trips[trips["NUMEROLUGAROFERTADOIDA"] > 0]
    trips = match_datasets(trips)

    # One row per directed pair, carrying the total of NUMEROVIAGEMIDA.
    edge_totals = (
        trips[["ORIGEM", "DESTINO", "NUMEROVIAGEMIDA"]]
        .groupby(["ORIGEM", "DESTINO"])
        .sum()
        .reset_index()
    )
    edge_totals = edge_totals[edge_totals["NUMEROVIAGEMIDA"] > 0]

    return edge_totals.rename(
        columns={
            'ORIGEM': 'source',
            'DESTINO': 'target',
            'NUMEROVIAGEMIDA': 'weight',
        }
    )
def main():
    """Build one bus graph per month and write the extracted features to CSV.

    For every year in ``range(2005, 2015)`` and every month 1-12 the edge list
    returned by ``bus_network`` is turned into a networkx graph. The graphs
    and their first-of-month dates are then handed to the feature extractor,
    and the result is saved to ``/data/features/bus_2005_2015.csv``.
    """
    feature_file = '/data/features/bus_2005_2015.csv'
    graphs_bus = []
    date_bus = []

    # NOTE(review): range(2005, 2015) stops at 2014 although the output file
    # is named "2005_2015" -- confirm whether 2015 should be included.
    for y in range(2005, 2015):
        for m in range(1, 13):
            try:
                df = bus_network(y, m)
            except Exception as err:
                # Best effort: a year whose data cannot be loaded is skipped
                # from this month onwards, but the reason is reported rather
                # than silently swallowed. Catching Exception (not a bare
                # except) lets Ctrl-C / SystemExit propagate.
                print(
                    "Skipping rest of {}: failed at month {} ({!r})".format(y, m, err),
                    file=sys.stderr,
                )
                break
            date_bus.append(date(y, m, 1))
            # Uses the default 'source'/'target' columns; edge_attr=True keeps
            # the remaining column ('weight') as an edge attribute.
            graphs_bus.append(nx.from_pandas_edgelist(df, edge_attr=True))

    # NOTE(review): the file only imports `features_extractor_single`; the
    # original call used the undefined name `features_extractor`. Confirm that
    # the imported function accepts (graphs, dates) and returns an object
    # with `to_csv`.
    features = features_extractor_single(graphs_bus, date_bus)
    features.to_csv(feature_file)


if __name__ == "__main__":
    main()