From bb6857711ac68c7dc8cbd8d703130a6599373253 Mon Sep 17 00:00:00 2001
From: Coulibaly Zie Mamadou <17972148+zie225@users.noreply.github.com>
Date: Sat, 18 Mar 2023 11:48:19 +0100
Subject: [PATCH] Add files via upload

---
 .gitignore                  |  14 +++++
 LICENSE                     |  21 +++++++
 README.md                   |  28 +++++++++
 config.env                  |  15 +++++
 docker-compose.yml          |  65 +++++++++++++++++++
 makefile                    |  40 ++++++++++++
 ml_docker_mongodb_kafka.txt | 120 ++++++++++++++++++++++++++++++++++++
 7 files changed, 303 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 config.env
 create mode 100644 docker-compose.yml
 create mode 100644 makefile
 create mode 100644 ml_docker_mongodb_kafka.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a171333
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,14 @@
+venv/
+.idea/
+airflow-webserver*
+logs/
+*.cfg
+__pycache__/
+.DS_Store
+
+docker/.bash_history/history
+dags/ml_project/models/*
+!dags/ml_project/models/.gitkeep
+
+config.env
+docker/scripts/config.ini
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..c400e17
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 Danylo Baibak
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5e136eb
--- /dev/null
+++ b/README.md
@@ -0,0 +1,28 @@
+# Machine Learning in production using Apache Airflow
+
+Building a solution that uses Machine Learning is a complex task in itself. Whilst academic Machine Learning has its roots in research from the 1980s, the practical implementation of Machine Learning systems in production is still relatively new.
+
+This project is an example of how to improve two parts of any Machine Learning project - Data Validation and Model Evaluation. The goal is to share practical ideas that you can introduce into your project with relatively little effort, yet still gain great benefits.
+
+* **Data Validation** is the process of ensuring that data is present, correct, and meaningful. Ensuring the quality of your data through automated validation checks is a critical step in building data pipelines at any organization.
+* **Model validation** occurs after you successfully train the model on the new data. We evaluate and validate the model before it is promoted to production; ideally, this happens in an offline model validation step.
+
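As a concrete illustration of the Data Validation step described above (this is only a sketch, not code from the project; the column names and thresholds are hypothetical), an automated check run before training might look like this:

import pandas as pd

def validate_batch(df: pd.DataFrame) -> None:
    """Fail fast if the incoming batch is missing, malformed, or implausible."""
    # Data is present: the batch must not be empty.
    assert len(df) > 0, "Batch contains no rows"

    # Data is correct: required columns exist and contain no nulls.
    required = ["feature_a", "feature_b", "target"]   # hypothetical schema
    missing = set(required) - set(df.columns)
    assert not missing, f"Missing columns: {missing}"
    assert df[required].notna().all().all(), "Null values in required columns"

    # Data is meaningful: values fall inside an expected range.
    assert df["feature_a"].between(0, 1).all(), "feature_a outside [0, 1]"

validate_batch(pd.DataFrame({"feature_a": [0.2, 0.9],
                             "feature_b": [1.0, 3.5],
                             "target": [0, 1]}))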

+You can read more details in the [article on Medium](https://medium.com/@danil.baibak/machine-learning-in-production-using-apache-airflow-91d25a4d8152).
+
+## Installation
+
+The project is dockerized. To get the image, you have two options:
+* `make pull` - the [prebuilt image](https://hub.docker.com/r/dbaibak/docker_airflow) will be pulled from Docker Hub;
+* `make build` - you can also build the [Docker image](https://github.com/DanilBaibak/ml-in-production/tree/master/docker) yourself.
+
+Then:
+* `make init_config` will initialize all necessary configs;
+* `make up_d` will start the application in detached mode. Once the application is started, you can access the project at http://localhost:8080/
+
+## Usage
+
+* `make bash` will create a new Bash session in the container.
+* `make stop` stops running containers without removing them.
+* `make down` stops and removes containers.
diff --git a/config.env b/config.env
new file mode 100644
index 0000000..f0415fa
--- /dev/null
+++ b/config.env
@@ -0,0 +1,15 @@
+CODEBASE_PULL_SCHEDULE=*/5 * * * *
+
+AIRFLOW__WEBSERVER__DAG_DEFAULT_VIEW=graph
+AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL=30
+AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL=30
+AIRFLOW__SCHEDULER__CATCHUP_BY_DEFAULT=False
+AIRFLOW__SCHEDULER__SCHEDULER_MAX_THREADS=1
+AIRFLOW__CORE__DAGBAG_IMPORT_TIMEOUT=90
+
+AIRFLOW__CORE__EXECUTOR=LocalExecutor
+AIRFLOW__CORE__LOAD_EXAMPLES=False
+AIRFLOW__CORE__REMOTE_LOGGING=False
+
+AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://service_postgres_airflow:5432/airflow?password=adminpassword&user=adminuser
+AIRFLOW_CONN_DEV_POSTGRES=postgresql://postgres_user:postgres_password@service_postgres_dev/development
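Both connection strings in config.env follow SQLAlchemy's URL format (dialect+driver://user:password@host:port/database, optionally with extra driver arguments as query parameters). A quick way to check how SQLAlchemy will interpret such a string before handing it to Airflow is sketched below; it assumes SQLAlchemy is installed, and the fallback value rewrites the config.env string into the plain user:password@host form, which is an assumption about that deployment.

import os
from sqlalchemy.engine.url import make_url

# Standard SQLAlchemy URL form: dialect+driver://user:password@host:port/database
conn = os.environ.get(
    "AIRFLOW__CORE__SQL_ALCHEMY_CONN",
    "postgresql+psycopg2://adminuser:adminpassword@service_postgres_airflow:5432/airflow",
)
url = make_url(conn)
print(url.drivername, url.username, url.host, url.port, url.database)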
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..4ed6a8b
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,65 @@
+version: '3'
+x-airflow-common:
+  &airflow-common
+  build:
+    context: .
+    dockerfile: docker/Dockerfile
+
+  image: apache/airflow:2.3.4
+  environment:
+    - AIRFLOW__CORE__EXECUTOR=LocalExecutor
+    - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow
+    - AIRFLOW__CORE__FERNET_KEY=FB0o_zt4e3Ziq3LdUUO7F2Z95cvFFx16hU8jTeR1ASM=
+    - AIRFLOW__CORE__LOAD_EXAMPLES=False
+    - AIRFLOW__CORE__LOGGING_LEVEL=INFO
+  volumes:
+    - ./dags:/opt/airflow/dags
+    - ./airflow-data/logs:/opt/airflow/logs
+    - ./airflow-data/plugins:/opt/airflow/plugins
+    - ./airflow-data/airflow.cfg:/opt/airflow/airflow.cfg
+    - ./data:/opt/airflow/data
+    - ./models:/opt/airflow/models
+  depends_on:
+    - postgres
+
+services:
+  postgres:
+    image: postgres:12
+    environment:
+      - POSTGRES_USER=airflow
+      - POSTGRES_PASSWORD=airflow
+      - POSTGRES_DB=airflow
+      - POSTGRES_PORT=5432
+    ports:
+      - "5432:5432"
+
+  airflow-init:
+    << : *airflow-common
+    container_name: airflow_init
+    entrypoint: /bin/bash
+    command:
+      - -c
+      - airflow users list || ( airflow db init &&
+        airflow users create
+          --role Admin
+          --username airflow
+          --password airflow
+          --email airflow@airflow.com
+          --firstname airflow
+          --lastname airflow )
+    restart: on-failure
+
+  airflow-webserver:
+    << : *airflow-common
+    command: airflow webserver
+    ports:
+      - 8080:8080
+    container_name: airflow_webserver
+    restart: always
+
+  airflow-scheduler:
+    << : *airflow-common
+    command: airflow scheduler
+    container_name: airflow_scheduler
+    restart: always
diff --git a/makefile b/makefile
new file mode 100644
index 0000000..df8d569
--- /dev/null
+++ b/makefile
@@ -0,0 +1,40 @@
+SHELL=/bin/bash
+
+pull:
+	@docker pull dbaibak/docker_airflow:latest
+
+init_config:
+	cp config.env.public config.env
+	cp docker/scripts/config.ini.public docker/scripts/config.ini
+
+build:
+	docker build ./docker -t dbaibak/docker_airflow
+
+up:
+	docker-compose up
+	docker ps
+
+up_d:
+	docker-compose up -d
+	docker ps
+
+stop:
+	docker-compose stop
+
+down:
+	docker-compose down --rmi local --volumes
+	make clean
+
+bash:
+	docker exec -it airflow_pipeline bash -c "cd airflow_home; bash"
+
+clean_airflow:
+	rm -rf */airflow-webserver*
+	rm -rf */airflow.cfg
+	rm -rf */unittests.cfg
+
+clean:
+	make clean_airflow
+	find . | grep -E "(__pycache__|\.pyc|\.pyo$$)" | xargs rm -rf
+	rm -rf .mypy_cache
+	rm -rf .pytest_cache
diff --git a/ml_docker_mongodb_kafka.txt b/ml_docker_mongodb_kafka.txt
new file mode 100644
index 0000000..25d9041
--- /dev/null
+++ b/ml_docker_mongodb_kafka.txt
@@ -0,0 +1,120 @@
+1. Create a Python 3.8 environment:
+# conda create -n ml-mongodb anaconda python=3.9.0
+# conda create -n ml-mongodb anaconda python=3.8.0
+# conda activate ml-mongodb
+# conda install ipykernel
+# python -m ipykernel install --user --name ml-mongodb --display-name "Env ml-mongodb"
+# pip install jupyter
+# pip install pycaret[full]==2.3.10
+# pip install kafka==1.3.5
+# pip install kafka-python
+# pip install numpy==1.19.5
+# pip install pandas==1.4.3
+# pip install pymongo==3.12.0
+
+Once docker-compose.yml is configured, open your command terminal, navigate to the folder where you saved
+docker-compose.yml and run the following command:
+
+docker-compose -f docker-compose.yml up -d
+docker-compose down
+
+A download should start. Once it has finished, run the command docker ps to check that
+everything started correctly.
+
+2. Data_generator
+
+The data generator produces the data (data collection).
+
+3. Producer
+
+The producer handles the data streaming: it sends the messages (data) to the broker (topic) created via http://localhost:9000/,
+and the consumer receives them as messages.
+
+4. Consumer
+
+The consumer handles the machine-learning side of the streaming, i.e. the ML pipeline: it receives the messages sent by the
+producer for the ML stream.
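As a concrete sketch of steps 3 and 4 using kafka-python (the topic name "ml_data" and the broker address localhost:9092 are assumptions; adjust them to whatever your docker-compose Kafka service exposes):

import json
from kafka import KafkaProducer, KafkaConsumer

# Producer: send one record (e.g. a row produced by the data generator) to the topic.
producer = KafkaProducer(
    bootstrap_servers="localhost:9092",
    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
)
producer.send("ml_data", {"feature_a": 0.42, "feature_b": 3.1})
producer.flush()

# Consumer: read records from the same topic and hand them to the ML pipeline.
consumer = KafkaConsumer(
    "ml_data",
    bootstrap_servers="localhost:9092",
    auto_offset_reset="earliest",
    value_deserializer=lambda m: json.loads(m.decode("utf-8")),
)
for message in consumer:
    record = message.value          # dict sent by the producer
    # ...feed `record` into the model / ML pipeline here...
    print(record)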
+
+Airflow: https://github.com/hamed225/airbender
+Airflow for images: https://github.com/btphan95/greenr-airflow/tree/master/scripts
+https://github.com/rahul765/Machine-Learning-Pipelines#readme
+
+Airflow: https://github.com/NicoloAlbanese/airflow-ml-pipeline-mvp (my best)
+
+Airflow: https://github.com/hamed225/ml-in-production (my second best)
+
+To install the Python packages needed for Apache Airflow, we have to set up the Dockerfile for the
+installations, then reference the Dockerfile in docker-compose.yml as follows:
+
+version: '3'
+x-airflow-common:
+  &airflow-common
+  build:
+    context: .
+    dockerfile: docker/Dockerfile
+
+Reference for this setting: https://airflow.apache.org/docs/docker-stack/build.html#quick-start-scenarios-of-image-extending
+
+- ./data:/opt/airflow/data
+- ./models:/opt/airflow/models
+
+Always add these last two entries to the volumes section of docker-compose.yml.
+
+Best docker-compose.yml: https://github.com/marclamberti/docker-airflow/blob/main/docker-compose.yml
+
+postgresql+psycopg2://user:password@postgres/db
+
+https://towardsdatascience.com/end-to-end-machine-learning-pipeline-with-docker-and-apache-airflow-from-scratch-35f6a75f57ad
+
+FIRST, ALWAYS OPEN A SHELL IN THE DOCKER CONTAINER THAT HOLDS THE AIRFLOW PIPELINE FOLDER:
+docker container exec -it airflow_webserver bash
+
+# Checking the saved models
+cd /opt/airflow/models
+ls -l
+
+# Checking the saved data
+cd /opt/airflow/data
+ls -l
+
+python
+import pandas as pd
+from sqlalchemy import create_engine
+engine = create_engine('postgresql+psycopg2://airflow:airflow@postgres/airflow')
+pd.read_sql('SELECT * FROM experiments', engine)
+
+pd.read_sql('SELECT * FROM batch_data', engine)
+
+MLflow
+
+https://github.com/burakince/mlflow
+
+https://github.com/hamed225/incremental_training
+
+https://github.com/zie225/mlflow_docker-compose
+
+http://localhost:9021/login
+
+Copy this key and put it in a secure place. You won't be able to view this key again after closing this modal:
+
+pnu_zwVwqfD2e1xQikC9ZO5OSqSpBXCScL2azsPp
+
+Or run this command to save the key to your active profile:
+
+prefect cloud login -k pnu_zwVwqfD2e1xQikC9ZO5OSqSpBXCScL2azsPp
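Putting the pieces together, the dags/ folder mounted into the containers above would hold the pipeline definition. The following is only an illustrative sketch (the dag_id, schedule, task names and the empty callables are assumptions, not code from any of the linked repositories) of a DAG wiring data validation, training, and offline model evaluation in the order discussed earlier:

from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

def validate_data(**_):
    # Run the schema / range checks on the latest batch before training.
    ...

def train_model(**_):
    # Fit a model on the validated batch and save it under /opt/airflow/models.
    ...

def evaluate_model(**_):
    # Compare the new model's offline metric against the model currently in
    # production and only promote it if the metric does not degrade.
    ...

with DAG(
    dag_id="ml_pipeline",
    start_date=datetime(2023, 3, 1),
    schedule_interval="@daily",
    catchup=False,
) as dag:
    validate = PythonOperator(task_id="validate_data", python_callable=validate_data)
    train = PythonOperator(task_id="train_model", python_callable=train_model)
    evaluate = PythonOperator(task_id="evaluate_model", python_callable=evaluate_model)

    validate >> train >> evaluate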