forked from FilippoBovo/production-data-science
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pipelines.py
36 lines (27 loc) · 1.14 KB
/
pipelines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import logging
import pandas as pd
from titanic import data, models
def run_titanic_analysis(filename):
    """Run the Titanic pipeline: load, clean, split, and fit the models.

    The CSV is read keeping only the ``Name``, ``Sex``, ``Age`` and
    ``Survived`` columns. Missing ``Age`` values are replaced by the column
    median, the columns are cast to fixed dtypes, and the frame is passed
    through ``data.extract_title``. The result is handed to
    ``models.data_preparation`` (``test_size=0.2``, ``random_state=0``) and
    the four objects it returns are fed, in order, to the majority-vote and
    logistic-regression runners.

    Parameters
    ----------
    filename: str
        Path to the Titanic CSV input data

    Returns
    -------
    None
        Nothing is returned; the start and end of the run are logged at
        INFO level on the root logger.
    """
    logging.info('Starting the data analysis pipeline')

    # Load only the columns the pipeline actually uses.
    wanted_columns = ['Name', 'Sex', 'Age', 'Survived']
    raw_frame = pd.read_csv(filename, usecols=wanted_columns)

    # Impute missing ages with the median computed from the loaded frame.
    imputed_frame = raw_frame.fillna({'Age': raw_frame['Age'].median()})

    # Pin the dtypes before handing the frame to the project helpers.
    column_dtypes = {
        'Age': 'float64',
        'Name': 'object',
        'Sex': 'category',
        'Survived': 'int64',
    }
    typed_frame = imputed_frame.astype(column_dtypes)

    processed_data = data.extract_title(typed_frame)

    # Fixed test_size/random_state, exactly as passed here on every run.
    train_x, test_x, train_y, test_y = models.data_preparation(
        processed_data,
        test_size=0.2,
        random_state=0,
    )

    # Both runners receive the same four split objects, in the same order.
    models.run_majority_vote(train_x, test_x, train_y, test_y)
    models.run_logistic_regression(train_x, test_x, train_y, test_y)

    logging.info('The data analysis pipeline has terminated')