From bb6857711ac68c7dc8cbd8d703130a6599373253 Mon Sep 17 00:00:00 2001
From: Coulibaly Zie Mamadou <17972148+zie225@users.noreply.github.com>
Date: Sat, 18 Mar 2023 11:48:19 +0100
Subject: [PATCH] Add files via upload
---
.gitignore | 14 +++++
LICENSE | 21 +++++++
README.md | 28 +++++++++
config.env | 15 +++++
docker-compose.yml | 65 +++++++++++++++++++
makefile | 40 ++++++++++++
ml_docker_mongodb_kafka.txt | 120 ++++++++++++++++++++++++++++++++++++
7 files changed, 303 insertions(+)
create mode 100644 .gitignore
create mode 100644 LICENSE
create mode 100644 README.md
create mode 100644 config.env
create mode 100644 docker-compose.yml
create mode 100644 makefile
create mode 100644 ml_docker_mongodb_kafka.txt
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a171333
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,14 @@
+venv/
+.idea/
+airflow-webserver*
+logs/
+*.cfg
+__pycache__/
+.DS_Store
+
+docker/.bash_history/history
+dags/ml_project/models/*
+!dags/ml_project/models/.gitkeep
+
+config.env
+docker/scripts/config.ini
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..c400e17
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 Danylo Baibak
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5e136eb
--- /dev/null
+++ b/README.md
@@ -0,0 +1,28 @@
+# Machine Learning in production using Apache Airflow
+
+Building a solution with Machine Learning is a complex task in itself. While academic Machine Learning has its roots in research from the 1980s, the practical implementation of Machine Learning systems in production is still relatively new.
+
+This project is an example of how you can improve two parts of any Machine Learning project - Data Validation and Model Evaluation. The goal is to share practical ideas that you can introduce into your project relatively easily, yet still gain great benefits.
+
+* **Data Validation** is the process of ensuring that data is present, correct, and meaningful. Ensuring the quality of your data through automated validation checks is a critical step in building data pipelines at any organization.
+* **Model Validation** occurs after you successfully train the model on the new data. We evaluate and validate the model before it is promoted to production. Ideally, the offline model validation step should include comparing the candidate model's metrics against the model currently serving in production; a minimal sketch of both kinds of checks is shown below.
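+
+A hypothetical sketch of such checks (illustration only; the column names, threshold, and tolerance are assumptions, not code from this repository):
+
+```python
+import pandas as pd
+
+
+def validate_batch(df: pd.DataFrame) -> None:
+    """Data validation: the batch is present, correct, and meaningful."""
+    expected = {"user_id", "amount", "created_at"}  # hypothetical columns
+    assert expected.issubset(df.columns), f"missing columns: {expected - set(df.columns)}"
+    assert len(df) > 0, "empty batch"
+    assert df["user_id"].notna().all(), "null user_id found"
+    assert (df["amount"] >= 0).all(), "negative amounts found"
+
+
+def validate_model(new_score: float, prod_score: float, tolerance: float = 0.02) -> bool:
+    """Model validation: promote only if the candidate is not worse than production."""
+    return new_score >= prod_score - tolerance
+```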
+
+You can read more details in the [article on Medium](https://medium.com/@danil.baibak/machine-learning-in-production-using-apache-airflow-91d25a4d8152).
+
+## Installation
+
+The project is dockerized. You have two options for getting the image, followed by two commands to configure and run it:
+* `make pull` - the [prebuilt image](https://hub.docker.com/r/dbaibak/docker_airflow) will be pulled from Docker Hub;
+* `make build` - alternatively, you can build the [Docker image](https://github.com/DanilBaibak/ml-in-production/tree/master/docker) yourself;
+* `make init_config` will initialize all necessary configs;
+* `make up_d` will start your application in detached mode. Once the application is running, you can access the project at http://localhost:8080/
+
+## Usage
+
+* `make bash` will create a new Bash session in the container.
+* `make stop` stops running containers without removing them.
+* `make down` stops and removes containers.
diff --git a/config.env b/config.env
new file mode 100644
index 0000000..f0415fa
--- /dev/null
+++ b/config.env
@@ -0,0 +1,15 @@
+CODEBASE_PULL_SCHEDULE=*/5 * * * *
+
+AIRFLOW__WEBSERVER__DAG_DEFAULT_VIEW=graph
+AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL=30
+AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL=30
+AIRFLOW__SCHEDULER__CATCHUP_BY_DEFAULT=False
+AIRFLOW__SCHEDULER__SCHEDULER_MAX_THREADS=1
+AIRFLOW__CORE__DAGBAG_IMPORT_TIMEOUT=90
+
+AIRFLOW__CORE__EXECUTOR=LocalExecutor
+AIRFLOW__CORE__LOAD_EXAMPLES=False
+AIRFLOW__CORE__REMOTE_LOGGING=False
+
+AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://service_postgres_airflow:5432/airflow?password=adminpassword&user=adminuser
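+# Airflow registers any AIRFLOW_CONN_<CONN_ID> environment variable as a connection,
+# so the line below is available to DAGs as conn_id "dev_postgres".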
+AIRFLOW_CONN_DEV_POSTGRES=postgresql://postgres_user:postgres_password@service_postgres_dev/development
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..4ed6a8b
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,65 @@
+version: '3'
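+# Configuration shared by all Airflow services, reused below via YAML anchor/merge
+# syntax (&airflow-common and <<: *airflow-common).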
+x-airflow-common:
+ &airflow-common
+ build:
+ context: .
+ dockerfile: docker/Dockerfile
+
+
+ image: apache/airflow:2.3.4
+ environment:
+ - AIRFLOW__CORE__EXECUTOR=LocalExecutor
+ - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow
+ - AIRFLOW__CORE__FERNET_KEY=FB0o_zt4e3Ziq3LdUUO7F2Z95cvFFx16hU8jTeR1ASM=
+ - AIRFLOW__CORE__LOAD_EXAMPLES=False
+ - AIRFLOW__CORE__LOGGING_LEVEL=INFO
+ volumes:
+ - ./dags:/opt/airflow/dags
+ - ./airflow-data/logs:/opt/airflow/logs
+ - ./airflow-data/plugins:/opt/airflow/plugins
+ - ./airflow-data/airflow.cfg:/opt/airflow/airflow.cfg
+ - ./data:/opt/airflow/data
+ - ./models:/opt/airflow/models
+ depends_on:
+ - postgres
+
+services:
+ postgres:
+ image: postgres:12
+ environment:
+ - POSTGRES_USER=airflow
+ - POSTGRES_PASSWORD=airflow
+ - POSTGRES_DB=airflow
+ - POSTGRES_PORT=5432
+ ports:
+ - "5432:5432"
+
+ airflow-init:
+ << : *airflow-common
+ container_name: airflow_init
+ entrypoint: /bin/bash
+ command:
+ - -c
+ - airflow users list || ( airflow db init &&
+ airflow users create
+ --role Admin
+ --username airflow
+ --password airflow
+ --email airflow@airflow.com
+ --firstname airflow
+ --lastname airflow )
+ restart: on-failure
+
+ airflow-webserver:
+ << : *airflow-common
+ command: airflow webserver
+ ports:
+ - 8080:8080
+ container_name: airflow_webserver
+ restart: always
+
+ airflow-scheduler:
+ << : *airflow-common
+ command: airflow scheduler
+ container_name: airflow_scheduler
+ restart: always
diff --git a/makefile b/makefile
new file mode 100644
index 0000000..df8d569
--- /dev/null
+++ b/makefile
@@ -0,0 +1,40 @@
+SHELL=/bin/bash
+
+pull:
+ @docker pull dbaibak/docker_airflow:latest
+
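+# config.env and docker/scripts/config.ini are git-ignored; create local copies from the *.public templates: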
+init_config:
+ cp config.env.public config.env
+ cp docker/scripts/config.ini.public docker/scripts/config.ini
+
+build:
+ docker build ./docker -t dbaibak/docker_airflow
+
+up:
+ docker-compose up
+ docker ps
+
+up_d:
+ docker-compose up -d
+ docker ps
+
+stop:
+ docker-compose stop
+
+down:
+ docker-compose down --rmi local --volumes
+ make clean
+
+bash:
+	docker exec -it airflow_webserver bash
+
+clean_airflow:
+ rm -rf */airflow-webserver*
+ rm -rf */airflow.cfg
+ rm -rf */unittests.cfg
+
+clean:
+ make clean_airflow
+	find . | grep -E "(__pycache__|\.pyc$|\.pyo$)" | xargs rm -rf
+ rm -rf .mypy_cache
+ rm -rf .pytest_cache
diff --git a/ml_docker_mongodb_kafka.txt b/ml_docker_mongodb_kafka.txt
new file mode 100644
index 0000000..25d9041
--- /dev/null
+++ b/ml_docker_mongodb_kafka.txt
@@ -0,0 +1,120 @@
+1. Create a Python environment (3.8 or 3.9):
+# conda create -n ml-mongodb anaconda python=3.9.0
+# conda create -n ml-mongodb anaconda python=3.8.0
+# conda activate ml-mongodb
+# conda install ipykernel
+# python -m ipykernel install --user --name ml-mongodb --display-name "Env ml-mongodb"
+# pip install jupyter
+# pip install pycaret[full]==2.3.10
+# pip install kafka==1.3.5
+# pip install kafka-python
+# pip install numpy==1.19.5
+# pip install pandas==1.4.3
+# pip install pymongo==3.12.0
+
+Once docker-compose.yml is configured, open your command terminal, navigate to the folder where
+you saved docker-compose.yml and run the following command:
+
+docker-compose -f docker-compose.yml up -d
+docker-compose down
+
+
+
+A download should start. Once it has finished, run the command docker ps to check that
+everything started correctly.
+
+2. Data_generator
+
+The data generator produces the data (data collection).
+
+3. Producer
+
+The producer handles the data streaming: it sends the messages (the data) to the broker (topic)
+created on http://localhost:9000/, which the consumer will receive as messages.
+
+4. Consumer
+
+The consumer handles the machine learning streaming, i.e. the ML pipeline: it receives the
+messages sent by the producer for the ML stream.
+
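+A minimal sketch of this producer/consumer exchange using kafka-python (the broker address
+localhost:9092 and the topic name ml_data are assumptions, not values taken from this project):
+
+from kafka import KafkaProducer, KafkaConsumer
+import json
+
+BROKER = "localhost:9092"   # hypothetical broker address
+TOPIC = "ml_data"           # hypothetical topic
+
+# Producer side: serialize each record as JSON and publish it to the topic
+producer = KafkaProducer(
+    bootstrap_servers=BROKER,
+    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
+)
+producer.send(TOPIC, {"feature_1": 0.42, "feature_2": 1.7})
+producer.flush()
+
+# Consumer side: read records back and hand them to the ML pipeline
+consumer = KafkaConsumer(
+    TOPIC,
+    bootstrap_servers=BROKER,
+    auto_offset_reset="earliest",
+    value_deserializer=lambda m: json.loads(m.decode("utf-8")),
+)
+for message in consumer:
+    features = message.value  # dict of features to feed into the ML pipeline
+    print(features)
+    break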
+
+Airflow : https://github.com/hamed225/airbender
+Airflow for image : https://github.com/btphan95/greenr-airflow/tree/master/scripts
+https://github.com/rahul765/Machine-Learning-Pipelines#readme
+
+airflow: https://github.com/NicoloAlbanese/airflow-ml-pipeline-mvp (my best)
+
+
+airflow: https://github.com/hamed225/ml-in-production (my second best)
+
+
+Along with installing the Python packages we need for Apache Airflow, we have to set up the Dockerfile
+for those installations, then reference the Dockerfile in docker-compose.yml as follows:
+
+version: '3'
+x-airflow-common:
+ &airflow-common
+ build:
+ context: .
+ dockerfile: docker/Dockerfile
+
+Link to the image-extension guide: https://airflow.apache.org/docs/docker-stack/build.html#quick-start-scenarios-of-image-extending
+
+- ./data:/opt/airflow/data
+- ./models:/opt/airflow/models
+
+Always add these last two entries to the volumes section of docker-compose.yml.
+
+Best docker-compose.yml: https://github.com/marclamberti/docker-airflow/blob/main/docker-compose.yml
+
+postgresql+psycopg2://user:password@postgres/db
+
+https://towardsdatascience.com/end-to-end-machine-learning-pipeline-with-docker-and-apache-airflow-from-scratch-35f6a75f57ad
+
+
+FIRST, ALWAYS OPEN A SHELL IN THE DOCKER CONTAINER RUNNING THE AIRFLOW PIPELINE:
+docker container exec -it airflow_webserver bash
+
+# Checking the saved models
+cd /opt/airflow/models
+ls -l
+
+# Checking the saved data
+cd /opt/airflow/data
+ls -l
+
+python
+import pandas as pd
+from sqlalchemy import create_engine
+
+# connect to the Postgres instance defined in docker-compose.yml
+engine = create_engine('postgresql+psycopg2://airflow:airflow@postgres/airflow')
+
+# inspect the tables populated by the pipeline
+pd.read_sql('SELECT * FROM experiments', engine)
+pd.read_sql('SELECT * FROM batch_data', engine)
+
+
+
+MLflow
+
+https://github.com/burakince/mlflow
+
+https://github.com/hamed225/incremental_training
+
+https://github.com/zie225/mlflow_docker-compose
+
+
+http://localhost:9021/login
+
+
+
+
+ Copy this key and put it in a secure place. You won't be able to view this key again after closing this modal:
+
+pnu_zwVwqfD2e1xQikC9ZO5OSqSpBXCScL2azsPp
+
+Or run this command to save the key to your active profile:
+
+
+prefect cloud login -k pnu_zwVwqfD2e1xQikC9ZO5OSqSpBXCScL2azsPp