diff --git a/404.html b/404.html index 419baf6..720138b 100644 --- a/404.html +++ b/404.html @@ -7,14 +7,14 @@ Page Not Found | THE DATA ENGINEERING COOKBOOK - +

Page Not Found

We could not find what you were looking for.

Please contact the owner of the site that linked you to the original URL and let them know their link is broken.

- + \ No newline at end of file diff --git a/assets/js/a4b6b237.47726173.js b/assets/js/a4b6b237.394c5a42.js similarity index 98% rename from assets/js/a4b6b237.47726173.js rename to assets/js/a4b6b237.394c5a42.js index f62d3c0..115f615 100644 --- a/assets/js/a4b6b237.47726173.js +++ b/assets/js/a4b6b237.394c5a42.js @@ -1 +1 @@ -(window.webpackJsonp=window.webpackJsonp||[]).push([[8],{76:function(e,t,a){"use strict";a.r(t),a.d(t,"frontMatter",(function(){return c})),a.d(t,"metadata",(function(){return o})),a.d(t,"toc",(function(){return l})),a.d(t,"default",(function(){return b}));var n=a(3),r=a(7),i=(a(0),a(88)),c={},o={unversionedId:"10-Updates",id:"10-Updates",isDocsHomePage:!1,title:"10-Updates",description:"Updates",source:"@site/docs/10-Updates.md",slug:"/10-Updates",permalink:"/docs/10-Updates",version:"current",sidebar:"projectsSidebar",previous:{title:"09-BooksAndCourses",permalink:"/docs/09-BooksAndCourses"}},l=[],s={toc:l};function b(e){var t=e.components,a=Object(r.a)(e,["components"]);return Object(i.b)("wrapper",Object(n.a)({},s,a,{components:t,mdxType:"MDXLayout"}),Object(i.b)("h1",{id:"updates"},"Updates"),Object(i.b)("p",null,"What's new? Here you can find a list of all the updates with links to the sections"),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-11-23"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Prepared a GenAI RAG example project that you can run on your own computer without internet. It uses Ollama with Mistral model and ElasticSearch. Working on a way of creating embeddings from pdf files and inserting them into ElsaticSearch for queries ",Object(i.b)("a",{parentName:"li",href:"/docs/04-HandsOnCourse#genai-retrieval-augmented-generation-with-ollama-and-elasticsearch"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-11-23"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Added an overview of AWS and Azure cloud certifications for Data Engineers. From beginners to experts ",Object(i.b)("a",{parentName:"li",href:"/docs/09-BooksAndCourses#Certifications"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-07-31"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},'Added 10 platform architecture react videos I did to the "Best Practices" section. 
This way you get a better feeling of what companies are doing and which tools they use ',Object(i.b)("a",{parentName:"li",href:"/docs/06-BestPracticesCloud#best-practices"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-07-17"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Added 20 API interview questoins and their answers ",Object(i.b)("a",{parentName:"li",href:"/docs/08-InterviewQuestions#apis"},"click here")),Object(i.b)("li",{parentName:"ul"},"Added 10 Python interview questions and their answers ",Object(i.b)("a",{parentName:"li",href:"/docs/03-AdvancedSkills#python"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-07-08"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Added large article about Snowflake and dbt for Data Engineers ",Object(i.b)("a",{parentName:"li",href:"/docs/03-AdvancedSkills#analytical-data-stores"},"click here")),Object(i.b)("li",{parentName:"ul"},'Added new secton "Analytical Data Stores" to Advanced skills with the Snowflake & dbt infos.'),Object(i.b)("li",{parentName:"ul"},'Put SQL and NoSQL datastores into a new section "Transactional Data Stores"')))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-03-20"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Added roadmap for Software Engineers / Computer Scientists ",Object(i.b)("a",{parentName:"li",href:"/docs/01-Introduction#roadmap-for-software-engineers"},"click here")),Object(i.b)("li",{parentName:"ul"},"Added many questions and answers from my interview on the Super Data Science Podcast (plus links to YouTube and the Podcast) ",Object(i.b)("a",{parentName:"li",href:"/docs/01-Introduction#Interview-with-Andreas-on-the-Super-Data-Science-Podcast"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-03-13"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},'Added "How to become a Senior Data Engineer" live stream series as a blog post with images shown in the live streams and the links to the videos. ',Object(i.b)("a",{parentName:"li",href:"/docs/01-Introduction#how-to-become-a-senior-data-engineer"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-03-08"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Included Data Engineering skills matrix into the introduction with link to the live stream. 
",Object(i.b)("a",{parentName:"li",href:"/docs/01-Introduction#data-engineers-skills-matrix"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-03-01"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Added updates section"),Object(i.b)("li",{parentName:"ul"},"Reworked the Hands-on courses section with 5 free courses / tutorials from Andreas on YouTube ",Object(i.b)("a",{parentName:"li",href:"/docs/04-HandsOnCourse"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-02-28"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Added Data Engineering Roadmap for Data Scientists: ",Object(i.b)("a",{parentName:"li",href:"/docs/01-Introduction#roadmap-for-data-scientists"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-02-25"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Data Engineering Roadmap for Software Engineers: ",Object(i.b)("a",{parentName:"li",href:"/docs/01-Introduction#roadmap-for-software-engineers"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-02-20"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Data Engineering Roadmap for Data Analysts: ",Object(i.b)("a",{parentName:"li",href:"/docs/01-Introduction#roadmap-for-data-analysts"},"click here"))))))}b.isMDXComponent=!0},88:function(e,t,a){"use strict";a.d(t,"a",(function(){return u})),a.d(t,"b",(function(){return m}));var n=a(0),r=a.n(n);function i(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function c(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,n)}return a}function o(e){for(var t=1;t=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var s=r.a.createContext({}),b=function(e){var t=r.a.useContext(s),a=t;return e&&(a="function"==typeof e?e(t):o(o({},t),e)),a},u=function(e){var t=b(e.components);return r.a.createElement(s.Provider,{value:t},e.children)},d={inlineCode:"code",wrapper:function(e){var t=e.children;return r.a.createElement(r.a.Fragment,{},t)}},p=r.a.forwardRef((function(e,t){var a=e.components,n=e.mdxType,i=e.originalType,c=e.parentName,s=l(e,["components","mdxType","originalType","parentName"]),u=b(a),p=n,m=u["".concat(c,".").concat(p)]||u[p]||d[p]||i;return a?r.a.createElement(m,o(o({ref:t},s),{},{components:a})):r.a.createElement(m,o({ref:t},s))}));function m(e,t){var a=arguments,n=t&&t.mdxType;if("string"==typeof e||n){var i=a.length,c=new Array(i);c[0]=p;var o={};for(var l in t)hasOwnProperty.call(t,l)&&(o[l]=t[l]);o.originalType=e,o.mdxType="string"==typeof e?e:n,c[1]=o;for(var s=2;s=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var s=r.a.createContext({}),b=function(e){var t=r.a.useContext(s),a=t;return e&&(a="function"==typeof e?e(t):o(o({},t),e)),a},u=function(e){var t=b(e.components);return 
r.a.createElement(s.Provider,{value:t},e.children)},d={inlineCode:"code",wrapper:function(e){var t=e.children;return r.a.createElement(r.a.Fragment,{},t)}},p=r.a.forwardRef((function(e,t){var a=e.components,n=e.mdxType,i=e.originalType,c=e.parentName,s=l(e,["components","mdxType","originalType","parentName"]),u=b(a),p=n,m=u["".concat(c,".").concat(p)]||u[p]||d[p]||i;return a?r.a.createElement(m,o(o({ref:t},s),{},{components:a})):r.a.createElement(m,o({ref:t},s))}));function m(e,t){var a=arguments,n=t&&t.mdxType;if("string"==typeof e||n){var i=a.length,c=new Array(i);c[0]=p;var o={};for(var l in t)hasOwnProperty.call(t,l)&&(o[l]=t[l]);o.originalType=e,o.mdxType="string"==typeof e?e:n,c[1]=o;for(var s=2;s sends you to the browser")),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"For Windows Users"),"\nConfigure WSL2 to use max only 4GB of ram:"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},'wsl --shutdown\nnotepad "$env:USERPROFILE/.wslconfig"\n')),Object(o.b)("p",null,".wslconfig file:"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"[wsl2]\nmemory=4GB # Limits VM memory in WSL 2 up to 4GB\n")),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Modify the Linux kernel map count in WSL"),"\nDo this before the start because Elasticsearch requires a higher value to work\n",Object(o.b)("inlineCode",{parentName:"p"},"sudo sysctl -w vm.max_map_count=262144")),Object(o.b)("ol",{start:4},Object(o.b)("li",{parentName:"ol"},"go to the Elasticsearch-RAG folder and do ",Object(o.b)("inlineCode",{parentName:"li"},"docker compose up")),Object(o.b)("li",{parentName:"ol"},"make sure you have Elasticsearch 8.11 or later (we use 8.16 here in this project) if you want to use your own Elasticsearch image"),Object(o.b)("li",{parentName:"ol"},"if you get this error on a mac then just open the console in the docker app: ",Object(o.b)("em",{parentName:"li"},"error getting credentials - err: exec: docker-credential-desktop: executable file not found in $PATH, out:")),Object(o.b)("li",{parentName:"ol"},"Install xcode command line tools: ",Object(o.b)("inlineCode",{parentName:"li"},"xcode-select --install")),Object(o.b)("li",{parentName:"ol"},"make sure you're at python 3.8.1 or larger -> installed 3.13.0 from ",Object(o.b)("a",{parentName:"li",href:"https://www.python.org/downloads/"},"https://www.python.org/downloads/"))),Object(o.b)("h3",{id:"setup-the-virtual-python-environment"},"Setup the virtual Python environment"),Object(o.b)("h4",{id:"preparation-on-a-mac"},"preparation on a Mac"),Object(o.b)("h5",{id:"install-brew"},"install brew"),Object(o.b)("p",null,'which brew\n/bin/bash -c "$(curl -fsSL ',Object(o.b)("a",{parentName:"p",href:"https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)%22"},'https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"'),'\nexport PATH="/opt/homebrew/bin:$PATH"\nbrew --version\nbrew install pyenv\nbrew install pyenv-virtualenv'),Object(o.b)("h5",{id:"install-pyenv"},"install pyenv"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"brew install pyenv\nbrew install pyenv-virtualenv\n")),Object(o.b)("p",null,"Modify the path so that pyenv is in the path variable\n",Object(o.b)("inlineCode",{parentName:"p"},"nano ~/.zshrc")),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},'export PYENV_ROOT="$HOME/.pyenv"\nexport PATH="$PYENV_ROOT/bin:$PATH"\neval "$(pyenv init --path)"\neval "$(pyenv init -)"\neval "$(pyenv virtualenv-init -)"\n')),Object(o.b)("p",null,"install dependencies for 
building python versions\n",Object(o.b)("inlineCode",{parentName:"p"},"brew install openssl readline sqlite3 xz zlib")),Object(o.b)("p",null,"Reload to apply changes\n",Object(o.b)("inlineCode",{parentName:"p"},"source ~/.zshrc")),Object(o.b)("p",null,"install python"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"pyenv install 3.11.6\npyenv version\n")),Object(o.b)("p",null,"Set Python version system wide\n",Object(o.b)("inlineCode",{parentName:"p"},"pyenv global 3.11.6")),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"pyenv virtualenv \npyenv activate \npyenv virtualenv-delete \n")),Object(o.b)("h4",{id:"windows-without-pyenv"},"Windows without pyenv"),Object(o.b)("p",null,"setup virtual python environment - go to the Elasticsearch-RAG folder and do\n",Object(o.b)("inlineCode",{parentName:"p"},"python3 -m venv .elkrag"),"\nenable the environment\n",Object(o.b)("inlineCode",{parentName:"p"},"source .elkrag/bin/activate")),Object(o.b)("h3",{id:"install-required-libraries-do-one-at-a-time-so-you-see-errors"},"Install required libraries (do one at a time so you see errors):"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"pip install llama-index (optional python3 -m pip install package name)\npip install llama-index-embeddings-ollama\npip install llama-index-llms-ollama\npip install llama-index-vector-stores-elasticsearch\npip install python-dotenv\n")),Object(o.b)("h3",{id:"write-the-data-to-elasticsearch"},"Write the data to Elasticsearch"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},"create / copy in the index.py file"),Object(o.b)("li",{parentName:"ol"},"download the conversations.json file from the folder code examples/GenAI-RAG"),Object(o.b)("li",{parentName:"ol"},"if you get an error with the execution then check if pedantic version is <2.0 ",Object(o.b)("inlineCode",{parentName:"li"},"pip show pydantic")," if not do this: ",Object(o.b)("inlineCode",{parentName:"li"},'pip install "pydantic<2.0')),Object(o.b)("li",{parentName:"ol"},"run the program index.py: ",Object(o.b)("a",{parentName:"li",href:"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/index.py"},"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/index.py"))),Object(o.b)("h3",{id:"check-the-data-in-elasticsearch"},"Check the data in Elasticsearch"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},"go to kibana http://localhost:5601/app/management/data/index_management/indices and see the new index called calls"),Object(o.b)("li",{parentName:"ol"},"go to dev tools and try out this query ",Object(o.b)("inlineCode",{parentName:"li"},"GET calls/_search?size=1 http://localhost:5601/app/dev_tools#/console/shell"))),Object(o.b)("h3",{id:"query-data-from-elasticsearch-and-create-an-output-with-mistral"},"Query data from elasticsearch and create an output with Mistral"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},"if everything is good then run the query.py file ",Object(o.b)("a",{parentName:"li",href:"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/query.py"},"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/query.py")),Object(o.b)("li",{parentName:"ol"},"try a few queries :)")),Object(o.b)("h3",{id:"install-libraries-to-extract-text-from-pdfs"},"Install libraries to extract text from pdfs"),Object(o.b)("h3",{id:"extract-data-from-cv-and-put-it-into-elasticsearch"},"Extract data from CV and put it into Elasticsearch"),Object(o.b)("p",null,"I created a CV 
with ChatGPT ",Object(o.b)("a",{parentName:"p",href:"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/Liam_McGivney_CV.pdf"},"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/Liam_McGivney_CV.pdf")),Object(o.b)("p",null,"Install the library to extract text from the pdf\n",Object(o.b)("inlineCode",{parentName:"p"},"pip install PyMuPDF"),"\nI had to Shift+Command+p then python clear workspace cache and reload window. Then it saw it :/"),Object(o.b)("p",null,"The file cvpipeline.py has the python code for the indexing. It's not working right now though!\n",Object(o.b)("a",{parentName:"p",href:"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/cvpipeline.py"},"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/cvpipeline.py")),Object(o.b)("p",null,"I'll keep developing this and update it once it's working."),Object(o.b)("h2",{id:"free-data-engineering-course-with-aws-tdengine-docker-and-grafana"},"Free Data Engineering Course with AWS TDengine Docker and Grafana"),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Free hands-on course:")," ",Object(o.b)("a",{parentName:"p",href:"https://youtu.be/eoj-CnrR9jA"},"Watch on YouTube")),Object(o.b)("p",null,"In this detailed tutorial video, Andreas guides viewers through creating an end-to-end data pipeline using time series data. The project focuses on fetching weather data from a Weather API, processing it on AWS, storing it in TDengine (a time series database), and visualizing the data with Grafana. Here's a concise summary of what the video covers:"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Introduction and Setup:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"The project is introduced along with a GitHub repository containing all necessary resources and a step-by-step guide."),Object(o.b)("li",{parentName:"ul"},"The pipeline architecture includes an IoT weather station, a Weather API, AWS for processing, TDengine for data storage, and Grafana for visualization.")),Object(o.b)("ol",{start:2},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Project Components:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Weather API:")," Utilizes weatherapi.com to fetch weather data."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"AWS Lambda:")," Processes the data fetched from the Weather API."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"TDengine:")," Serves as the time series database to store processed data. 
It's highlighted for its performance and simplicity, especially for time series data."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Grafana:")," Used for creating dashboards to visualize the time series data.")),Object(o.b)("ol",{start:3},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Development and Deployment:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"The local development environment setup includes Python, Docker, and VS Code."),Object(o.b)("li",{parentName:"ul"},"The tutorial covers the creation of a Docker image for the project and deploying it to AWS Elastic Container Registry (ECR)."),Object(o.b)("li",{parentName:"ul"},"AWS Lambda is then configured to use the Docker image from ECR."),Object(o.b)("li",{parentName:"ul"},"AWS EventBridge is used to schedule the Lambda function to run at specified intervals.")),Object(o.b)("ol",{start:4},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Time Series Data:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"The importance of time series data and the benefits of using a time series database like TDengine over traditional relational databases are discussed."),Object(o.b)("li",{parentName:"ul"},"TDengine's features such as speed, scaling, data retention, and built-in functions for time series data are highlighted.")),Object(o.b)("ol",{start:5},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Building the Pipeline:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"Detailed instructions are provided for setting up each component of the pipeline:",Object(o.b)("ul",{parentName:"li"},Object(o.b)("li",{parentName:"ul"},"Fetching weather data from the Weather API."),Object(o.b)("li",{parentName:"ul"},"Processing and sending the data to TDengine using an AWS Lambda function."),Object(o.b)("li",{parentName:"ul"},"Visualizing the data with Grafana."))),Object(o.b)("li",{parentName:"ul"},"Each step includes code snippets and configurations needed to implement the pipeline.")),Object(o.b)("ol",{start:6},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Conclusion:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"The video concludes with a demonstration of the completed pipeline, showing weather data visualizations in Grafana."),Object(o.b)("li",{parentName:"ul"},"Viewers are encouraged to replicate the project using the resources provided in the GitHub repository linked in the video description.")),Object(o.b)("p",null,"This video provides a comprehensive guide to building a data pipeline with a focus on time series data, demonstrating the integration of various technologies and platforms to achieve an end-to-end solution."),Object(o.b)("h2",{id:"monitor-your-data-in-dbt-and-detect-quality-issues-with-elementary"},"Monitor your data in dbt and detect quality issues with Elementary"),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Free hands-on tutorial:")," ",Object(o.b)("a",{parentName:"p",href:"https://youtu.be/6fnU91Q2gq0"},"Watch on YouTube")),Object(o.b)("p",null,"In this comprehensive tutorial, Andreas delves into the integration of dbt (data build tool) with Elementary to enhance data monitoring and quality detection within Snowflake databases. The tutorial is structured to guide viewers through a hands-on experience, starting with an introduction to a sample project setup and the common challenges faced in monitoring dbt jobs. 
It then transitions into how Elementary can be utilized to address these challenges effectively."),Object(o.b)("p",null,"Key learning points and tutorial structure include:"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Introduction to the Sample Project:")," Andreas showcases a project setup involving Snowflake as the data warehouse, dbt for data modeling and testing, and a visualization tool for data analysis. This setup serves as the basis for the tutorial."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Challenges in Monitoring dbt Jobs:")," Common issues in monitoring dbt jobs are discussed, highlighting the limitations of the dbt interface in providing comprehensive monitoring capabilities."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Introduction to Elementary:")," Elementary is introduced as a dbt-native data observability tool designed to enhance the monitoring and analysis of dbt jobs. It offers both open-source and cloud versions, with the tutorial focusing on the cloud version."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Setup Requirements:")," The tutorial covers the necessary setup on both the Snowflake and dbt sides, including schema creation, user and role configuration in Snowflake, and modifications to the dbt project for integrating with Elementary."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Elementary's User Interface and Features:")," A thorough walkthrough of Elementary's interface is provided, showcasing its dashboard, test results, model runs, data catalog, and data lineage features. The tool's ability to automatically run additional tests, like anomaly detection and schema change detection, is also highlighted."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Advantages of Using Elementary:")," The presenter outlines several benefits of using Elementary, such as easy implementation, native test integration, clean and straightforward UI, and enhanced privacy due to data being stored within the user's data warehouse."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Potential Drawbacks:")," Some potential drawbacks are discussed, including the additional load on dbt job execution due to more models being run and limitations in dashboard customization."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Summary and Verdict:")," The tutorial concludes with a summary of the key features and benefits of using Elementary with dbt, emphasizing its value in improving data quality monitoring and detection.")),Object(o.b)("p",null,"Overall, viewers are guided through setting up and utilizing Elementary for dbt data monitoring, gaining insights into its capabilities, setup process, and the practical benefits it offers for data quality assurance."),Object(o.b)("h2",{id:"solving-engineers-4-biggest-airflow-problems"},"Solving Engineers 4 Biggest Airflow Problems"),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Free hands-on tutorial:")," ",Object(o.b)("a",{parentName:"p",href:"https://youtu.be/b9bMNEh8bes"},"Watch on YouTube")),Object(o.b)("p",null,"In this informative video, Andreas discusses the four major challenges engineers face when working with Apache Airflow and introduces Astronomer, a managed Airflow service that addresses these issues effectively. 
Astronomer is highlighted as a solution that simplifies Airflow deployment and management, making it easier for engineers to develop, deploy, and monitor their data pipelines. Here's a summary of the key points discussed for each challenge and how Astronomer provides solutions:"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},"Managing Airflow Deployments:")),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Challenge:")," Setting up and maintaining Airflow deployments is complex and time-consuming, involving configuring cloud instances, managing resources, scaling, and updating the Airflow system."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Solution with Astronomer:")," Offers a straightforward deployment process where users can easily configure their deployments, choose cloud providers (GCP, AWS, Azure), and set up scaling with just a few clicks. Astronomer handles the complexity, making it easier to manage production and quality environments.")),Object(o.b)("ol",{start:2},Object(o.b)("li",{parentName:"ol"},"Development Environment and Deployment:")),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Challenge:")," Local installation of Airflow is complicated due to its dependency on multiple Docker containers and the need for extensive configuration."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Solution with Astronomer:")," Provides a CLI tool for setting up a local development environment with a single command, simplifying the process of developing, testing, and deploying pipelines. The Astronomer CLI also helps in initializing project templates and deploying Dags to the cloud effortlessly.")),Object(o.b)("ol",{start:3},Object(o.b)("li",{parentName:"ol"},"Source Code Management and CI/CD Pipelines:")),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Challenge:")," Collaborative development and continuous integration/deployment (CI/CD) are essential but challenging to implement effectively with Airflow alone."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Solution with Astronomer:")," Facilitates easy integration with GitHub for source code management and GitHub Actions for CI/CD. This allows automatic testing and deployment of pipeline code, ensuring a smooth workflow for teams working on pipeline development.")),Object(o.b)("ol",{start:4},Object(o.b)("li",{parentName:"ol"},"Observing Pipelines and Alarms:")),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Challenge:")," Monitoring data pipelines and getting timely alerts when issues occur is crucial but often difficult to achieve."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Solution with Astronomer:")," The Astronomer platform provides a user-friendly interface for monitoring pipeline status and performance. It also offers customizable alerts for failures or prolonged task durations, with notifications via email, PagerDuty, or Slack, ensuring immediate awareness and response to issues.")),Object(o.b)("p",null,"Overall, the video shows Astronomer as a powerful and user-friendly platform that addresses the common challenges of using Airflow, from deployment and development to collaboration, CI/CD, and monitoring. 
It suggests that Astronomer can significantly improve the experience of engineers working with Airflow, making it easier to manage, develop, and monitor data pipelines."),Object(o.b)("h2",{id:"the-best-alternative-to-airlfow-mageai"},"The best alternative to Airlfow? Mage.ai"),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Free hands-on tutorial:")," ",Object(o.b)("a",{parentName:"p",href:"https://youtu.be/3gXsFEC3aYA"},"Watch on YouTube")),Object(o.b)("p",null,"In this insightful video, Andreas introduces Mage, a promising alternative to Apache Airflow, focusing on its simplicity, user-friendliness, and scalability. The video provides a comprehensive walkthrough of Mage, highlighting its key features and advantages over Airflow. Here's a breakdown of what viewers can learn and expect from the video:"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Deployment Ease:")," Mage offers a stark contrast to Airflow's complex setup process. It simplifies deployment to a single Docker image, making it straightforward to install and start on any machine, whether it's local or cloud-based on AWS, GCP, or Azure. This simplicity extends to scaling, which Mage handles horizontally, particularly beneficial in Kubernetes environments where performance scales with the number of pipelines."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"User Interface (UI):")," Mage shines with its UI, presenting a dark mode interface that's not only visually appealing but also simplifies navigation and pipeline management. The UI facilitates easy access to pipelines, scheduling, and monitoring of pipeline runs, offering a more intuitive experience compared to Airflow."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Pipeline Creation and Modification:")," Mage streamlines the creation of ETL pipelines, allowing users to easily add data loaders, transformers, and exporters through its UI. It supports direct interaction with APIs for data loading and provides a visual representation of the data flow, enhancing the overall pipeline design experience."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Data Visualization and Exploration:")," Beyond simple pipeline creation, Mage enables in-depth data exploration within the UI. Users can generate various charts, such as histograms and bar charts, to analyze the data directly, a feature that greatly enhances the tool's utility."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Testing and Scheduling:")," Testing pipelines in Mage is straightforward, allowing for quick integration of tests to ensure data quality and pipeline reliability. Scheduling is also versatile, supporting standard time-based triggers, event-based triggers for real-time data ingestion, and API calls for on-demand pipeline execution."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Support for Streaming and ELT Processes:")," Mage is not limited to ETL workflows but also supports streaming and ELT processes. It integrates seamlessly with DBT models for in-warehouse transformations and Spark for big data processing, showcasing its versatility and scalability."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Conclusion and Call to Action:")," Andreas concludes by praising the direction in which the industry is moving, with tools like Mage simplifying data engineering processes. 
He encourages viewers to try Mage and engage with the content by liking, subscribing, and commenting on their current tools and the potential impact of Mage.")),Object(o.b)("p",null,"Overall, the video shows Mage as a highly user-friendly, scalable, and versatile tool for data pipeline creation and management, offering a compelling alternative to traditional tools like Airflow."))}d.isMDXComponent=!0},88:function(e,t,a){"use strict";a.d(t,"a",(function(){return b})),a.d(t,"b",(function(){return m}));var n=a(0),i=a.n(n);function o(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function r(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,n)}return a}function l(e){for(var t=1;t=0||(i[a]=e[a]);return i}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(i[a]=e[a])}return i}var c=i.a.createContext({}),d=function(e){var t=i.a.useContext(c),a=t;return e&&(a="function"==typeof e?e(t):l(l({},t),e)),a},b=function(e){var t=d(e.components);return i.a.createElement(c.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return i.a.createElement(i.a.Fragment,{},t)}},u=i.a.forwardRef((function(e,t){var a=e.components,n=e.mdxType,o=e.originalType,r=e.parentName,c=s(e,["components","mdxType","originalType","parentName"]),b=d(a),u=n,m=b["".concat(r,".").concat(u)]||b[u]||p[u]||o;return a?i.a.createElement(m,l(l({ref:t},c),{},{components:a})):i.a.createElement(m,l({ref:t},c))}));function m(e,t){var a=arguments,n=t&&t.mdxType;if("string"==typeof e||n){var o=a.length,r=new Array(o);r[0]=u;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:n,r[1]=l;for(var c=2;c sends you to the browser")),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"For Windows Users"),"\nConfigure WSL2 to use max only 4GB of ram:"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},'wsl --shutdown\nnotepad "$env:USERPROFILE/.wslconfig"\n')),Object(o.b)("p",null,".wslconfig file:"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"[wsl2]\nmemory=4GB # Limits VM memory in WSL 2 up to 4GB\n")),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Modify the Linux kernel map count in WSL"),"\nDo this before the start because Elasticsearch requires a higher value to work\n",Object(o.b)("inlineCode",{parentName:"p"},"sudo sysctl -w vm.max_map_count=262144")),Object(o.b)("ol",{start:4},Object(o.b)("li",{parentName:"ol"},"go to the Elasticsearch-RAG folder and do ",Object(o.b)("inlineCode",{parentName:"li"},"docker compose up")),Object(o.b)("li",{parentName:"ol"},"make sure you have Elasticsearch 8.11 or later (we use 8.16 here in this project) if you want to use your own Elasticsearch image"),Object(o.b)("li",{parentName:"ol"},"if you get this error on a mac then just open the console in the docker app: ",Object(o.b)("em",{parentName:"li"},"error getting credentials - err: exec: docker-credential-desktop: executable file not found in $PATH, out:")),Object(o.b)("li",{parentName:"ol"},"Install xcode command line tools: ",Object(o.b)("inlineCode",{parentName:"li"},"xcode-select --install")),Object(o.b)("li",{parentName:"ol"},"make sure you're at python 3.8.1 or larger -> installed 3.13.0 from 
",Object(o.b)("a",{parentName:"li",href:"https://www.python.org/downloads/"},"https://www.python.org/downloads/"))),Object(o.b)("h3",{id:"setup-the-virtual-python-environment"},"Setup the virtual Python environment"),Object(o.b)("h4",{id:"preparation-on-a-mac"},"preparation on a Mac"),Object(o.b)("h5",{id:"install-brew"},"install brew"),Object(o.b)("p",null,'which brew\n/bin/bash -c "$(curl -fsSL ',Object(o.b)("a",{parentName:"p",href:"https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)%22"},'https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"'),'\nexport PATH="/opt/homebrew/bin:$PATH"\nbrew --version\nbrew install pyenv\nbrew install pyenv-virtualenv'),Object(o.b)("h5",{id:"install-pyenv"},"install pyenv"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"brew install pyenv\nbrew install pyenv-virtualenv\n")),Object(o.b)("p",null,"Modify the path so that pyenv is in the path variable\n",Object(o.b)("inlineCode",{parentName:"p"},"nano ~/.zshrc")),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},'export PYENV_ROOT="$HOME/.pyenv"\nexport PATH="$PYENV_ROOT/bin:$PATH"\neval "$(pyenv init --path)"\neval "$(pyenv init -)"\neval "$(pyenv virtualenv-init -)"\n')),Object(o.b)("p",null,"install dependencies for building python versions\n",Object(o.b)("inlineCode",{parentName:"p"},"brew install openssl readline sqlite3 xz zlib")),Object(o.b)("p",null,"Reload to apply changes\n",Object(o.b)("inlineCode",{parentName:"p"},"source ~/.zshrc")),Object(o.b)("p",null,"install python"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"pyenv install 3.11.6\npyenv version\n")),Object(o.b)("p",null,"Set Python version system wide\n",Object(o.b)("inlineCode",{parentName:"p"},"pyenv global 3.11.6")),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"pyenv virtualenv \npyenv activate \npyenv virtualenv-delete \n")),Object(o.b)("h4",{id:"windows-without-pyenv"},"Windows without pyenv"),Object(o.b)("p",null,"setup virtual python environment - go to the Elasticsearch-RAG folder and do\n",Object(o.b)("inlineCode",{parentName:"p"},"python3 -m venv .elkrag"),"\nenable the environment\n",Object(o.b)("inlineCode",{parentName:"p"},"source .elkrag/bin/activate")),Object(o.b)("h3",{id:"install-required-libraries-do-one-at-a-time-so-you-see-errors"},"Install required libraries (do one at a time so you see errors):"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"pip install llama-index (optional python3 -m pip install package name)\npip install llama-index-embeddings-ollama\npip install llama-index-llms-ollama\npip install llama-index-vector-stores-elasticsearch\npip install python-dotenv\n")),Object(o.b)("h3",{id:"write-the-data-to-elasticsearch"},"Write the data to Elasticsearch"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},"create / copy in the index.py file"),Object(o.b)("li",{parentName:"ol"},"download the conversations.json file from the folder code examples/GenAI-RAG"),Object(o.b)("li",{parentName:"ol"},"if you get an error with the execution then check if pedantic version is <2.0 ",Object(o.b)("inlineCode",{parentName:"li"},"pip show pydantic")," if not do this: ",Object(o.b)("inlineCode",{parentName:"li"},'pip install "pydantic<2.0')),Object(o.b)("li",{parentName:"ol"},"run the program index.py: 
",Object(o.b)("a",{parentName:"li",href:"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/index.py"},"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/index.py"))),Object(o.b)("h3",{id:"check-the-data-in-elasticsearch"},"Check the data in Elasticsearch"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},"go to kibana http://localhost:5601/app/management/data/index_management/indices and see the new index called calls"),Object(o.b)("li",{parentName:"ol"},"go to dev tools and try out this query ",Object(o.b)("inlineCode",{parentName:"li"},"GET calls/_search?size=1 http://localhost:5601/app/dev_tools#/console/shell"))),Object(o.b)("h3",{id:"query-data-from-elasticsearch-and-create-an-output-with-mistral"},"Query data from elasticsearch and create an output with Mistral"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},"if everything is good then run the query.py file ",Object(o.b)("a",{parentName:"li",href:"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/query.py"},"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/query.py")),Object(o.b)("li",{parentName:"ol"},"try a few queries :)")),Object(o.b)("h3",{id:"install-libraries-to-extract-text-from-pdfs"},"Install libraries to extract text from pdfs"),Object(o.b)("h3",{id:"extract-data-from-cv-and-put-it-into-elasticsearch"},"Extract data from CV and put it into Elasticsearch"),Object(o.b)("p",null,"I created a CV with ChatGPT ",Object(o.b)("a",{parentName:"p",href:"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/Liam_McGivney_CV.pdf"},"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/Liam_McGivney_CV.pdf")),Object(o.b)("p",null,"Install the library to extract text from the pdf\n",Object(o.b)("inlineCode",{parentName:"p"},"pip install PyMuPDF"),"\nI had to Shift+Command+p then python clear workspace cache and reload window. Then it saw it :/"),Object(o.b)("p",null,"The file cvpipeline.py has the python code for the indexing. It's not working right now though!\n",Object(o.b)("a",{parentName:"p",href:"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/cvpipeline.py"},"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/cvpipeline.py")),Object(o.b)("p",null,"I'll keep developing this and update it once it's working."),Object(o.b)("h2",{id:"free-data-engineering-course-with-aws-tdengine-docker-and-grafana"},"Free Data Engineering Course with AWS TDengine Docker and Grafana"),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Free hands-on course:")," ",Object(o.b)("a",{parentName:"p",href:"https://youtu.be/eoj-CnrR9jA"},"Watch on YouTube")),Object(o.b)("p",null,"In this detailed tutorial video, Andreas guides viewers through creating an end-to-end data pipeline using time series data. The project focuses on fetching weather data from a Weather API, processing it on AWS, storing it in TDengine (a time series database), and visualizing the data with Grafana. 
Here's a concise summary of what the video covers:"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Introduction and Setup:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"The project is introduced along with a GitHub repository containing all necessary resources and a step-by-step guide."),Object(o.b)("li",{parentName:"ul"},"The pipeline architecture includes an IoT weather station, a Weather API, AWS for processing, TDengine for data storage, and Grafana for visualization.")),Object(o.b)("ol",{start:2},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Project Components:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Weather API:")," Utilizes weatherapi.com to fetch weather data."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"AWS Lambda:")," Processes the data fetched from the Weather API."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"TDengine:")," Serves as the time series database to store processed data. It's highlighted for its performance and simplicity, especially for time series data."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Grafana:")," Used for creating dashboards to visualize the time series data.")),Object(o.b)("ol",{start:3},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Development and Deployment:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"The local development environment setup includes Python, Docker, and VS Code."),Object(o.b)("li",{parentName:"ul"},"The tutorial covers the creation of a Docker image for the project and deploying it to AWS Elastic Container Registry (ECR)."),Object(o.b)("li",{parentName:"ul"},"AWS Lambda is then configured to use the Docker image from ECR."),Object(o.b)("li",{parentName:"ul"},"AWS EventBridge is used to schedule the Lambda function to run at specified intervals.")),Object(o.b)("ol",{start:4},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Time Series Data:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"The importance of time series data and the benefits of using a time series database like TDengine over traditional relational databases are discussed."),Object(o.b)("li",{parentName:"ul"},"TDengine's features such as speed, scaling, data retention, and built-in functions for time series data are highlighted.")),Object(o.b)("ol",{start:5},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Building the Pipeline:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"Detailed instructions are provided for setting up each component of the pipeline:",Object(o.b)("ul",{parentName:"li"},Object(o.b)("li",{parentName:"ul"},"Fetching weather data from the Weather API."),Object(o.b)("li",{parentName:"ul"},"Processing and sending the data to TDengine using an AWS Lambda function."),Object(o.b)("li",{parentName:"ul"},"Visualizing the data with Grafana."))),Object(o.b)("li",{parentName:"ul"},"Each step includes code snippets and configurations needed to implement the pipeline.")),Object(o.b)("ol",{start:6},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Conclusion:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"The video concludes with a demonstration of the completed pipeline, showing weather data visualizations in 
Grafana."),Object(o.b)("li",{parentName:"ul"},"Viewers are encouraged to replicate the project using the resources provided in the GitHub repository linked in the video description.")),Object(o.b)("p",null,"This video provides a comprehensive guide to building a data pipeline with a focus on time series data, demonstrating the integration of various technologies and platforms to achieve an end-to-end solution."),Object(o.b)("h2",{id:"monitor-your-data-in-dbt-and-detect-quality-issues-with-elementary"},"Monitor your data in dbt and detect quality issues with Elementary"),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Free hands-on tutorial:")," ",Object(o.b)("a",{parentName:"p",href:"https://youtu.be/6fnU91Q2gq0"},"Watch on YouTube")),Object(o.b)("p",null,"In this comprehensive tutorial, Andreas delves into the integration of dbt (data build tool) with Elementary to enhance data monitoring and quality detection within Snowflake databases. The tutorial is structured to guide viewers through a hands-on experience, starting with an introduction to a sample project setup and the common challenges faced in monitoring dbt jobs. It then transitions into how Elementary can be utilized to address these challenges effectively."),Object(o.b)("p",null,"Key learning points and tutorial structure include:"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Introduction to the Sample Project:")," Andreas showcases a project setup involving Snowflake as the data warehouse, dbt for data modeling and testing, and a visualization tool for data analysis. This setup serves as the basis for the tutorial."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Challenges in Monitoring dbt Jobs:")," Common issues in monitoring dbt jobs are discussed, highlighting the limitations of the dbt interface in providing comprehensive monitoring capabilities."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Introduction to Elementary:")," Elementary is introduced as a dbt-native data observability tool designed to enhance the monitoring and analysis of dbt jobs. It offers both open-source and cloud versions, with the tutorial focusing on the cloud version."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Setup Requirements:")," The tutorial covers the necessary setup on both the Snowflake and dbt sides, including schema creation, user and role configuration in Snowflake, and modifications to the dbt project for integrating with Elementary."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Elementary's User Interface and Features:")," A thorough walkthrough of Elementary's interface is provided, showcasing its dashboard, test results, model runs, data catalog, and data lineage features. 
The tool's ability to automatically run additional tests, like anomaly detection and schema change detection, is also highlighted."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Advantages of Using Elementary:")," The presenter outlines several benefits of using Elementary, such as easy implementation, native test integration, clean and straightforward UI, and enhanced privacy due to data being stored within the user's data warehouse."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Potential Drawbacks:")," Some potential drawbacks are discussed, including the additional load on dbt job execution due to more models being run and limitations in dashboard customization."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Summary and Verdict:")," The tutorial concludes with a summary of the key features and benefits of using Elementary with dbt, emphasizing its value in improving data quality monitoring and detection.")),Object(o.b)("p",null,"Overall, viewers are guided through setting up and utilizing Elementary for dbt data monitoring, gaining insights into its capabilities, setup process, and the practical benefits it offers for data quality assurance."),Object(o.b)("h2",{id:"solving-engineers-4-biggest-airflow-problems"},"Solving Engineers 4 Biggest Airflow Problems"),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Free hands-on tutorial:")," ",Object(o.b)("a",{parentName:"p",href:"https://youtu.be/b9bMNEh8bes"},"Watch on YouTube")),Object(o.b)("p",null,"In this informative video, Andreas discusses the four major challenges engineers face when working with Apache Airflow and introduces Astronomer, a managed Airflow service that addresses these issues effectively. Astronomer is highlighted as a solution that simplifies Airflow deployment and management, making it easier for engineers to develop, deploy, and monitor their data pipelines. Here's a summary of the key points discussed for each challenge and how Astronomer provides solutions:"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},"Managing Airflow Deployments:")),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Challenge:")," Setting up and maintaining Airflow deployments is complex and time-consuming, involving configuring cloud instances, managing resources, scaling, and updating the Airflow system."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Solution with Astronomer:")," Offers a straightforward deployment process where users can easily configure their deployments, choose cloud providers (GCP, AWS, Azure), and set up scaling with just a few clicks. Astronomer handles the complexity, making it easier to manage production and quality environments.")),Object(o.b)("ol",{start:2},Object(o.b)("li",{parentName:"ol"},"Development Environment and Deployment:")),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Challenge:")," Local installation of Airflow is complicated due to its dependency on multiple Docker containers and the need for extensive configuration."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Solution with Astronomer:")," Provides a CLI tool for setting up a local development environment with a single command, simplifying the process of developing, testing, and deploying pipelines. 
The Astronomer CLI also helps in initializing project templates and deploying Dags to the cloud effortlessly.")),Object(o.b)("ol",{start:3},Object(o.b)("li",{parentName:"ol"},"Source Code Management and CI/CD Pipelines:")),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Challenge:")," Collaborative development and continuous integration/deployment (CI/CD) are essential but challenging to implement effectively with Airflow alone."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Solution with Astronomer:")," Facilitates easy integration with GitHub for source code management and GitHub Actions for CI/CD. This allows automatic testing and deployment of pipeline code, ensuring a smooth workflow for teams working on pipeline development.")),Object(o.b)("ol",{start:4},Object(o.b)("li",{parentName:"ol"},"Observing Pipelines and Alarms:")),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Challenge:")," Monitoring data pipelines and getting timely alerts when issues occur is crucial but often difficult to achieve."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Solution with Astronomer:")," The Astronomer platform provides a user-friendly interface for monitoring pipeline status and performance. It also offers customizable alerts for failures or prolonged task durations, with notifications via email, PagerDuty, or Slack, ensuring immediate awareness and response to issues.")),Object(o.b)("p",null,"Overall, the video shows Astronomer as a powerful and user-friendly platform that addresses the common challenges of using Airflow, from deployment and development to collaboration, CI/CD, and monitoring. It suggests that Astronomer can significantly improve the experience of engineers working with Airflow, making it easier to manage, develop, and monitor data pipelines."),Object(o.b)("h2",{id:"the-best-alternative-to-airlfow-mageai"},"The best alternative to Airlfow? Mage.ai"),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Free hands-on tutorial:")," ",Object(o.b)("a",{parentName:"p",href:"https://youtu.be/3gXsFEC3aYA"},"Watch on YouTube")),Object(o.b)("p",null,"In this insightful video, Andreas introduces Mage, a promising alternative to Apache Airflow, focusing on its simplicity, user-friendliness, and scalability. The video provides a comprehensive walkthrough of Mage, highlighting its key features and advantages over Airflow. Here's a breakdown of what viewers can learn and expect from the video:"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Deployment Ease:")," Mage offers a stark contrast to Airflow's complex setup process. It simplifies deployment to a single Docker image, making it straightforward to install and start on any machine, whether it's local or cloud-based on AWS, GCP, or Azure. This simplicity extends to scaling, which Mage handles horizontally, particularly beneficial in Kubernetes environments where performance scales with the number of pipelines."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"User Interface (UI):")," Mage shines with its UI, presenting a dark mode interface that's not only visually appealing but also simplifies navigation and pipeline management. 
The UI facilitates easy access to pipelines, scheduling, and monitoring of pipeline runs, offering a more intuitive experience compared to Airflow."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Pipeline Creation and Modification:")," Mage streamlines the creation of ETL pipelines, allowing users to easily add data loaders, transformers, and exporters through its UI. It supports direct interaction with APIs for data loading and provides a visual representation of the data flow, enhancing the overall pipeline design experience."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Data Visualization and Exploration:")," Beyond simple pipeline creation, Mage enables in-depth data exploration within the UI. Users can generate various charts, such as histograms and bar charts, to analyze the data directly, a feature that greatly enhances the tool's utility."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Testing and Scheduling:")," Testing pipelines in Mage is straightforward, allowing for quick integration of tests to ensure data quality and pipeline reliability. Scheduling is also versatile, supporting standard time-based triggers, event-based triggers for real-time data ingestion, and API calls for on-demand pipeline execution."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Support for Streaming and ELT Processes:")," Mage is not limited to ETL workflows but also supports streaming and ELT processes. It integrates seamlessly with DBT models for in-warehouse transformations and Spark for big data processing, showcasing its versatility and scalability."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Conclusion and Call to Action:")," Andreas concludes by praising the direction in which the industry is moving, with tools like Mage simplifying data engineering processes. 
He encourages viewers to try Mage and engage with the content by liking, subscribing, and commenting on their current tools and the potential impact of Mage.")),Object(o.b)("p",null,"Overall, the video shows Mage as a highly user-friendly, scalable, and versatile tool for data pipeline creation and management, offering a compelling alternative to traditional tools like Airflow."))}d.isMDXComponent=!0},88:function(e,t,a){"use strict";a.d(t,"a",(function(){return b})),a.d(t,"b",(function(){return m}));var n=a(0),i=a.n(n);function o(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function r(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,n)}return a}function l(e){for(var t=1;t=0||(i[a]=e[a]);return i}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(i[a]=e[a])}return i}var c=i.a.createContext({}),d=function(e){var t=i.a.useContext(c),a=t;return e&&(a="function"==typeof e?e(t):l(l({},t),e)),a},b=function(e){var t=d(e.components);return i.a.createElement(c.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return i.a.createElement(i.a.Fragment,{},t)}},u=i.a.forwardRef((function(e,t){var a=e.components,n=e.mdxType,o=e.originalType,r=e.parentName,c=s(e,["components","mdxType","originalType","parentName"]),b=d(a),u=n,m=b["".concat(r,".").concat(u)]||b[u]||p[u]||o;return a?i.a.createElement(m,l(l({ref:t},c),{},{components:a})):i.a.createElement(m,l({ref:t},c))}));function m(e,t){var a=arguments,n=t&&t.mdxType;if("string"==typeof e||n){var o=a.length,r=new Array(o);r[0]=u;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:n,r[1]=l;for(var c=2;c 01-Introduction | THE DATA ENGINEERING COOKBOOK - + @@ -138,7 +138,7 @@ | In part 3 I focused on everything regarding Leadership and Communication: team management, project management, collaboration, problem solving, strategic thinking, communication and leadership | Watch on YouTube|

Watch on YouTube

Final Thoughts#

The path to becoming a senior data engineer is both challenging and rewarding. It requires a blend of technical prowess, continuous learning, and the development of soft skills that enable you to lead and innovate. Whether you're just starting out or looking to advance your career, focusing on the key areas outlined above will set you on the right path.

- + diff --git a/docs/02-BasicSkills/index.html b/docs/02-BasicSkills/index.html index 1954c1f..fca38cf 100644 --- a/docs/02-BasicSkills/index.html +++ b/docs/02-BasicSkills/index.html @@ -7,7 +7,7 @@ 02-BasicSkills | THE DATA ENGINEERING COOKBOOK - + @@ -199,7 +199,7 @@ a IoT device, logs, or whatever.

A data catalog is also important. It explains which features are available and how different data sets are labeled.

All different types of data. Now, here comes the engineering part.

The Data Engineer's part is making this data available. Available to the data scientist and the machine learning process.

So when you look at the model, on the left side you have your hyperparameter configuration. You need to store and manage these configurations somehow.

Then you have the actual training data.

There's a lot going on with the training data:

Where does it come from? Who owns it? Which is basically data governance.

What's the lineage? Have you modified this data? What did you do, what was the basis, the raw data?

You need to access all this data somehow. In training and in production.

In production you need to have access to the live data.

All this is the data engineers job. Making the data available.

First, an architect needs to build the platform. This can also be done by a good data engineer.

Then the data engineer needs to build the pipelines. How is the data coming in, and how does the platform connect to other systems?

How is that data then put into storage? Is there pre-processing necessary for the algorithms? The data engineer will do it.

Once the data and the systems are available, it's time for the machine learning part.

It is ready for processing. Basically ready for the data scientist.

Once the analytics is done, the data engineer needs to build pipelines to make the results accessible again. For instance for other analytics processes, for APIs, for front ends and so on.

All in all, the data engineer's part is a computer science part.

That's why I love it so much :)

- + diff --git a/docs/03-AdvancedSkills/index.html b/docs/03-AdvancedSkills/index.html index 09c93ad..ce1d816 100644 --- a/docs/03-AdvancedSkills/index.html +++ b/docs/03-AdvancedSkills/index.html @@ -7,7 +7,7 @@ 03-AdvancedSkills | THE DATA ENGINEERING COOKBOOK - + @@ -436,7 +436,7 @@ already there.

They just need to show us that the algorithms work. The end.

AWS Sagemaker#

Train and apply models online with Sagemaker

Link to the OLX Slideshare with pros, cons and how to use Sagemaker: https://www.slideshare.net/mobile/AlexeyGrigorev/image-models-infrastructure-at-olx

- + diff --git a/docs/04-HandsOnCourse/index.html b/docs/04-HandsOnCourse/index.html index 123a5f3..172afcf 100644 --- a/docs/04-HandsOnCourse/index.html +++ b/docs/04-HandsOnCourse/index.html @@ -7,7 +7,7 @@ 04-HandsOnCourse | THE DATA ENGINEERING COOKBOOK - + @@ -15,11 +15,11 @@ - +
-

04-HandsOnCourse

Data Engineering Course: Building A Data Platform#

Contents#

GenAI Retrieval Augmented Generation with Ollama and ElasticSearch#

  • This how-to is based on this one from Elasticsearch: https://www.elastic.co/search-labs/blog/rag-with-llamaIndex-and-elasticsearch
  • Instead of Elasticsearch cloud we're going to run everything locally
  • The simplest way to get this done is to just clone this GitHub Repo for the code and docker setup
  • I've tried this on a M1 Mac. Changes for Windows with WSL will come later.
  • The biggest problems that I had were actually installing the dependencies rather than the code itself.

Install Ollama#

  1. Download Ollama from here https://ollama.com/download/mac
  2. Unzip, drag into applications and install
  3. do ollama run mistral (It's going to download the Mistral 7b model, 4.1GB size)
  4. Create a new folder in Documents "Elasticsearch-RAG"
  5. Open that folder in VSCode

Install Elasticsearch & Kibana (Docker)#

  1. Use the docker-compose file from this repo: https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/docker-compose.yml
  2. Download Docker Desktop from here: https://www.docker.com/products/docker-desktop/
  3. Install docker desktop and sign in in the app/create a user -> sends you to the browser

For Windows Users +

04-HandsOnCourse

Data Engineering Course: Building A Data Platform#

Contents#

GenAI Retrieval Augmented Generation with Ollama and Elasticsearch#

  • This how-to is based on this guide from Elasticsearch: https://www.elastic.co/search-labs/blog/rag-with-llamaIndex-and-elasticsearch
  • Instead of Elasticsearch cloud we're going to run everything locally
  • The simplest way to get this done is to just clone this GitHub Repo for the code and docker setup
  • I've tried this on an M1 Mac. Changes for Windows with WSL will come later.
  • The biggest problems I had were actually with installing the dependencies rather than with the code itself.

Install Ollama#

  1. Download Ollama from here https://ollama.com/download/mac
  2. Unzip, drag into applications and install
  3. run ollama run mistral (it's going to download the Mistral 7B model, about 4.1GB; a quick way to verify the local API afterwards is sketched below)
  4. Create a new folder in Documents "Elasticsearch-RAG"
  5. Open that folder in VSCode
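
Once ollama run mistral has finished downloading, a quick way to check that the local Ollama API answers is a tiny Python script like the following (a minimal sketch using only the standard library; it assumes Ollama's default port 11434):

import json
import urllib.request

# Ask the locally running Ollama server (default port 11434) for a short completion
req = urllib.request.Request(
    "http://localhost:11434/api/generate",
    data=json.dumps({"model": "mistral", "prompt": "Say hello in one sentence.", "stream": False}).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req, timeout=120) as resp:
    print(json.load(resp)["response"])  # the generated text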

Install Elasticsearch & Kibana (Docker)#

  1. Use the docker-compose file from this repo: https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/docker-compose.yml
  2. Download Docker Desktop from here: https://www.docker.com/products/docker-desktop/
  3. Install Docker Desktop and sign in in the app / create a user -> it sends you to the browser

For Windows users: configure WSL2 to use at most 4GB of RAM:

wsl --shutdown
notepad "$env:USERPROFILE/.wslconfig"

.wslconfig file:

[wsl2]
memory=4GB # Limits VM memory in WSL 2 up to 4GB

Modify the Linux kernel map count in WSL. Do this before starting, because Elasticsearch requires a higher value to work:

sudo sysctl -w vm.max_map_count=262144

  1. go to the Elasticsearch-RAG folder and run docker compose up (a quick Python check that Elasticsearch is reachable follows below)
  2. make sure you have Elasticsearch 8.11 or later (we use 8.16 in this project) if you want to use your own Elasticsearch image
  3. if you get this error on a Mac, just open the console in the Docker app: error getting credentials - err: exec: docker-credential-desktop: executable file not found in $PATH, out:
  4. Install the Xcode command line tools: xcode-select --install
  5. make sure you're on Python 3.8.1 or later -> I installed 3.13.0 from https://www.python.org/downloads/
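
Before moving on, you can also confirm from Python that the Elasticsearch container is reachable (a minimal standard-library sketch; it assumes the compose file exposes port 9200 on localhost without TLS or authentication):

import json
import urllib.request

# The Elasticsearch root endpoint returns cluster and version info as JSON.
# If your compose file enables security, you would need to add auth/TLS here.
with urllib.request.urlopen("http://localhost:9200", timeout=10) as resp:
    info = json.load(resp)
print(info["version"]["number"])  # should be 8.11 or later, e.g. 8.16.x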

Setup the virtual Python environment#

preparation on a Mac#

install brew#

which brew @@ -37,9 +37,9 @@ source .elkrag/bin/activate

Install required libraries (do one at a time so you see errors):#

pip install llama-index (optional python3 -m pip install package name)
pip install llama-index-embeddings-ollama
pip install llama-index-llms-ollama
pip install llama-index-vector-stores-elasticsearch
pip install python-dotenv

Write the data to Elasticsearch#

  1. create / copy in the index.py file
  2. download the conversations.json file from the folder code examples/GenAI-RAG
  3. if you get an error during execution, check whether the pydantic version is <2.0 with pip show pydantic; if not, do this: pip install "pydantic<2.0"
  4. run the program index.py (a minimal sketch of what such a script does is shown below): https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/index.py
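
For orientation, here is a minimal sketch of what such an indexing script can look like with the packages installed above. It is an illustration rather than the exact index.py from the repo (which may target an older llama-index API, hence the pydantic hint above); the "conversation" field name and the calls index name are assumptions, so adjust them to the real conversations.json structure:

import json
from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.vector_stores.elasticsearch import ElasticsearchStore

# Use the local Ollama server to create the embeddings
Settings.embed_model = OllamaEmbedding(model_name="mistral", base_url="http://localhost:11434")

# Load the example data and wrap each record as a LlamaIndex Document
# (assumes each record has a "conversation" text field; adjust to the real JSON structure)
with open("conversations.json") as f:
    records = json.load(f)
documents = [Document(text=r["conversation"]) for r in records]

# Store text chunks and embeddings in the local Elasticsearch index "calls"
vector_store = ElasticsearchStore(index_name="calls", es_url="http://localhost:9200")
storage_context = StorageContext.from_defaults(vector_store=vector_store)
VectorStoreIndex.from_documents(documents, storage_context=storage_context)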

Check the data in Elasticsearch#

  1. go to kibana http://localhost:5601/app/management/data/index_management/indices and see the new index called calls
  2. go to dev tools and try out this query GET calls/_search?size=1 http://localhost:5601/app/dev_tools#/console/shell

Query data from Elasticsearch and create an output with Mistral#

  1. if everything is good, run the query.py file (a minimal sketch is shown below): https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/query.py
  2. try a few queries :)
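
As a rough orientation for the query side, a script along these lines re-opens the calls index and lets the local Mistral model answer over the retrieved documents (a minimal sketch under the same assumptions as above, not the exact query.py from the repo):

from llama_index.core import Settings, VectorStoreIndex
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.vector_stores.elasticsearch import ElasticsearchStore

# Same embedding model as during indexing, plus Mistral as the answering LLM
Settings.embed_model = OllamaEmbedding(model_name="mistral", base_url="http://localhost:11434")
Settings.llm = Ollama(model="mistral", base_url="http://localhost:11434", request_timeout=120.0)

# Re-open the existing "calls" index in Elasticsearch and build a query engine on top of it
vector_store = ElasticsearchStore(index_name="calls", es_url="http://localhost:9200")
index = VectorStoreIndex.from_vector_store(vector_store)
query_engine = index.as_query_engine()

print(query_engine.query("What were the customers complaining about?"))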

Install libraries to extract text from PDFs#

Extract data from CV and put it into Elasticsearch#

I created a CV with ChatGPT https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/Liam_McGivney_CV.pdf

Install the library to extract text from the PDF: pip install PyMuPDF. In VSCode I had to hit Shift+Command+P, run the Python clear workspace cache command, and reload the window. Then it saw it :/
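
Extracting the raw text from a PDF with PyMuPDF takes only a few lines; a minimal sketch (the file name matches the CV linked above, the rest is the standard PyMuPDF API):

import fitz  # PyMuPDF is imported as "fitz"

# Open the PDF and concatenate the plain text of all pages
doc = fitz.open("Liam_McGivney_CV.pdf")
text = "".join(page.get_text() for page in doc)
doc.close()
print(text[:500])  # preview the first 500 characters

The extracted text can then be wrapped in a Document and indexed the same way as the conversations above.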

The file cvpipeline.py has the python code for the indexing. It's not working right now though! -https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/cvpipeline.py

I'll keep developing this and update it once it's working.

Free Data Engineering Course with AWS TDengine Docker and Grafana#

Free hands-on course: Watch on YouTube

In this detailed tutorial video, Andreas guides viewers through creating an end-to-end data pipeline using time series data. The project focuses on fetching weather data from a Weather API, processing it on AWS, storing it in TDengine (a time series database), and visualizing the data with Grafana. Here's a concise summary of what the video covers:

  1. Introduction and Setup:
  • The project is introduced along with a GitHub repository containing all necessary resources and a step-by-step guide.
  • The pipeline architecture includes an IoT weather station, a Weather API, AWS for processing, TDengine for data storage, and Grafana for visualization.
  1. Project Components:
  • Weather API: Utilizes weatherapi.com to fetch weather data.
  • AWS Lambda: Processes the data fetched from the Weather API.
  • TDengine: Serves as the time series database to store processed data. It's highlighted for its performance and simplicity, especially for time series data.
  • Grafana: Used for creating dashboards to visualize the time series data.
  1. Development and Deployment:
  • The local development environment setup includes Python, Docker, and VS Code.
  • The tutorial covers the creation of a Docker image for the project and deploying it to AWS Elastic Container Registry (ECR).
  • AWS Lambda is then configured to use the Docker image from ECR.
  • AWS EventBridge is used to schedule the Lambda function to run at specified intervals.
  1. Time Series Data:
  • The importance of time series data and the benefits of using a time series database like TDengine over traditional relational databases are discussed.
  • TDengine's features such as speed, scaling, data retention, and built-in functions for time series data are highlighted.
  1. Building the Pipeline:
  • Detailed instructions are provided for setting up each component of the pipeline:
    • Fetching weather data from the Weather API.
    • Processing and sending the data to TDengine using an AWS Lambda function.
    • Visualizing the data with Grafana.
  • Each step includes code snippets and configurations needed to implement the pipeline.
  1. Conclusion:
  • The video concludes with a demonstration of the completed pipeline, showing weather data visualizations in Grafana.
  • Viewers are encouraged to replicate the project using the resources provided in the GitHub repository linked in the video description.

This video provides a comprehensive guide to building a data pipeline with a focus on time series data, demonstrating the integration of various technologies and platforms to achieve an end-to-end solution.

Monitor your data in dbt and detect quality issues with Elementary#

Free hands-on tutorial: Watch on YouTube

In this comprehensive tutorial, Andreas delves into the integration of dbt (data build tool) with Elementary to enhance data monitoring and quality detection within Snowflake databases. The tutorial is structured to guide viewers through a hands-on experience, starting with an introduction to a sample project setup and the common challenges faced in monitoring dbt jobs. It then transitions into how Elementary can be utilized to address these challenges effectively.

Key learning points and tutorial structure include:

  1. Introduction to the Sample Project: Andreas showcases a project setup involving Snowflake as the data warehouse, dbt for data modeling and testing, and a visualization tool for data analysis. This setup serves as the basis for the tutorial.
  2. Challenges in Monitoring dbt Jobs: Common issues in monitoring dbt jobs are discussed, highlighting the limitations of the dbt interface in providing comprehensive monitoring capabilities.
  3. Introduction to Elementary: Elementary is introduced as a dbt-native data observability tool designed to enhance the monitoring and analysis of dbt jobs. It offers both open-source and cloud versions, with the tutorial focusing on the cloud version.
  4. Setup Requirements: The tutorial covers the necessary setup on both the Snowflake and dbt sides, including schema creation, user and role configuration in Snowflake, and modifications to the dbt project for integrating with Elementary.
  5. Elementary's User Interface and Features: A thorough walkthrough of Elementary's interface is provided, showcasing its dashboard, test results, model runs, data catalog, and data lineage features. The tool's ability to automatically run additional tests, like anomaly detection and schema change detection, is also highlighted.
  6. Advantages of Using Elementary: The presenter outlines several benefits of using Elementary, such as easy implementation, native test integration, clean and straightforward UI, and enhanced privacy due to data being stored within the user's data warehouse.
  7. Potential Drawbacks: Some potential drawbacks are discussed, including the additional load on dbt job execution due to more models being run and limitations in dashboard customization.
  8. Summary and Verdict: The tutorial concludes with a summary of the key features and benefits of using Elementary with dbt, emphasizing its value in improving data quality monitoring and detection.

Overall, viewers are guided through setting up and utilizing Elementary for dbt data monitoring, gaining insights into its capabilities, setup process, and the practical benefits it offers for data quality assurance.

Solving Engineers 4 Biggest Airflow Problems#

Free hands-on tutorial: Watch on YouTube

In this informative video, Andreas discusses the four major challenges engineers face when working with Apache Airflow and introduces Astronomer, a managed Airflow service that addresses these issues effectively. Astronomer is highlighted as a solution that simplifies Airflow deployment and management, making it easier for engineers to develop, deploy, and monitor their data pipelines. Here's a summary of the key points discussed for each challenge and how Astronomer provides solutions:

  1. Managing Airflow Deployments:
  • Challenge: Setting up and maintaining Airflow deployments is complex and time-consuming, involving configuring cloud instances, managing resources, scaling, and updating the Airflow system.
  • Solution with Astronomer: Offers a straightforward deployment process where users can easily configure their deployments, choose cloud providers (GCP, AWS, Azure), and set up scaling with just a few clicks. Astronomer handles the complexity, making it easier to manage production and quality environments.
  1. Development Environment and Deployment:
  • Challenge: Local installation of Airflow is complicated due to its dependency on multiple Docker containers and the need for extensive configuration.
  • Solution with Astronomer: Provides a CLI tool for setting up a local development environment with a single command, simplifying the process of developing, testing, and deploying pipelines. The Astronomer CLI also helps in initializing project templates and deploying Dags to the cloud effortlessly.
  1. Source Code Management and CI/CD Pipelines:
  • Challenge: Collaborative development and continuous integration/deployment (CI/CD) are essential but challenging to implement effectively with Airflow alone.
  • Solution with Astronomer: Facilitates easy integration with GitHub for source code management and GitHub Actions for CI/CD. This allows automatic testing and deployment of pipeline code, ensuring a smooth workflow for teams working on pipeline development.
  1. Observing Pipelines and Alarms:
  • Challenge: Monitoring data pipelines and getting timely alerts when issues occur is crucial but often difficult to achieve.
  • Solution with Astronomer: The Astronomer platform provides a user-friendly interface for monitoring pipeline status and performance. It also offers customizable alerts for failures or prolonged task durations, with notifications via email, PagerDuty, or Slack, ensuring immediate awareness and response to issues.

Overall, the video shows Astronomer as a powerful and user-friendly platform that addresses the common challenges of using Airflow, from deployment and development to collaboration, CI/CD, and monitoring. It suggests that Astronomer can significantly improve the experience of engineers working with Airflow, making it easier to manage, develop, and monitor data pipelines.

The best alternative to Airflow? Mage.ai#

Free hands-on tutorial: Watch on YouTube

In this insightful video, Andreas introduces Mage, a promising alternative to Apache Airflow, focusing on its simplicity, user-friendliness, and scalability. The video provides a comprehensive walkthrough of Mage, highlighting its key features and advantages over Airflow. Here's a breakdown of what viewers can learn and expect from the video:

  1. Deployment Ease: Mage offers a stark contrast to Airflow's complex setup process. It simplifies deployment to a single Docker image, making it straightforward to install and start on any machine, whether it's local or cloud-based on AWS, GCP, or Azure. This simplicity extends to scaling, which Mage handles horizontally, particularly beneficial in Kubernetes environments where performance scales with the number of pipelines.
  2. User Interface (UI): Mage shines with its UI, presenting a dark mode interface that's not only visually appealing but also simplifies navigation and pipeline management. The UI facilitates easy access to pipelines, scheduling, and monitoring of pipeline runs, offering a more intuitive experience compared to Airflow.
  3. Pipeline Creation and Modification: Mage streamlines the creation of ETL pipelines, allowing users to easily add data loaders, transformers, and exporters through its UI. It supports direct interaction with APIs for data loading and provides a visual representation of the data flow, enhancing the overall pipeline design experience.
  4. Data Visualization and Exploration: Beyond simple pipeline creation, Mage enables in-depth data exploration within the UI. Users can generate various charts, such as histograms and bar charts, to analyze the data directly, a feature that greatly enhances the tool's utility.
  5. Testing and Scheduling: Testing pipelines in Mage is straightforward, allowing for quick integration of tests to ensure data quality and pipeline reliability. Scheduling is also versatile, supporting standard time-based triggers, event-based triggers for real-time data ingestion, and API calls for on-demand pipeline execution.
  6. Support for Streaming and ELT Processes: Mage is not limited to ETL workflows but also supports streaming and ELT processes. It integrates seamlessly with DBT models for in-warehouse transformations and Spark for big data processing, showcasing its versatility and scalability.
  7. Conclusion and Call to Action: Andreas concludes by praising the direction in which the industry is moving, with tools like Mage simplifying data engineering processes. He encourages viewers to try Mage and engage with the content by liking, subscribing, and commenting on their current tools and the potential impact of Mage.

Overall, the video shows Mage as a highly user-friendly, scalable, and versatile tool for data pipeline creation and management, offering a compelling alternative to traditional tools like Airflow.

+https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/cvpipeline.py

I'll keep developing this and update it once it's working.

Free Data Engineering Course with AWS TDengine Docker and Grafana#

Free hands-on course: Watch on YouTube

In this detailed tutorial video, Andreas guides viewers through creating an end-to-end data pipeline using time series data. The project focuses on fetching weather data from a Weather API, processing it on AWS, storing it in TDengine (a time series database), and visualizing the data with Grafana. Here's a concise summary of what the video covers:

  1. Introduction and Setup:
  • The project is introduced along with a GitHub repository containing all necessary resources and a step-by-step guide.
  • The pipeline architecture includes an IoT weather station, a Weather API, AWS for processing, TDengine for data storage, and Grafana for visualization.
  2. Project Components:
  • Weather API: Utilizes weatherapi.com to fetch weather data.
  • AWS Lambda: Processes the data fetched from the Weather API.
  • TDengine: Serves as the time series database to store processed data. It's highlighted for its performance and simplicity, especially for time series data.
  • Grafana: Used for creating dashboards to visualize the time series data.
  3. Development and Deployment:
  • The local development environment setup includes Python, Docker, and VS Code.
  • The tutorial covers the creation of a Docker image for the project and deploying it to AWS Elastic Container Registry (ECR).
  • AWS Lambda is then configured to use the Docker image from ECR.
  • AWS EventBridge is used to schedule the Lambda function to run at specified intervals.
  4. Time Series Data:
  • The importance of time series data and the benefits of using a time series database like TDengine over traditional relational databases are discussed.
  • TDengine's features such as speed, scaling, data retention, and built-in functions for time series data are highlighted.
  5. Building the Pipeline:
  • Detailed instructions are provided for setting up each component of the pipeline:
    • Fetching weather data from the Weather API.
    • Processing and sending the data to TDengine using an AWS Lambda function.
    • Visualizing the data with Grafana.
  • Each step includes code snippets and configurations needed to implement the pipeline.
  6. Conclusion:
  • The video concludes with a demonstration of the completed pipeline, showing weather data visualizations in Grafana.
  • Viewers are encouraged to replicate the project using the resources provided in the GitHub repository linked in the video description.

This video provides a comprehensive guide to building a data pipeline with a focus on time series data, demonstrating the integration of various technologies and platforms to achieve an end-to-end solution.
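
To make the Lambda step more concrete, here is a hedged sketch of a handler that pulls the current weather from weatherapi.com and writes one value to TDengine over its REST interface (an illustration only, not the course code: the WEATHER_API_KEY and TDENGINE_URL environment variables, the weather.berlin table with a (ts, temp_c) schema, and the default root:taosdata credentials are all assumptions):

import os
import requests

def lambda_handler(event, context):
    # 1. Fetch the current weather from weatherapi.com
    weather = requests.get(
        "http://api.weatherapi.com/v1/current.json",
        params={"key": os.environ["WEATHER_API_KEY"], "q": "Berlin"},
        timeout=10,
    ).json()
    temp_c = weather["current"]["temp_c"]

    # 2. Write it to TDengine via the REST endpoint (default port 6041);
    #    assumes a table weather.berlin (ts TIMESTAMP, temp_c FLOAT) already exists
    resp = requests.post(
        os.environ.get("TDENGINE_URL", "http://localhost:6041/rest/sql"),
        data=f"INSERT INTO weather.berlin VALUES (NOW, {temp_c})",
        auth=("root", "taosdata"),  # TDengine default credentials, change for real deployments
        timeout=10,
    )
    resp.raise_for_status()
    return {"statusCode": 200, "body": f"wrote temp_c={temp_c}"}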

Monitor your data in dbt and detect quality issues with Elementary#

Free hands-on tutorial: Watch on YouTube

In this comprehensive tutorial, Andreas delves into the integration of dbt (data build tool) with Elementary to enhance data monitoring and quality detection within Snowflake databases. The tutorial is structured to guide viewers through a hands-on experience, starting with an introduction to a sample project setup and the common challenges faced in monitoring dbt jobs. It then transitions into how Elementary can be utilized to address these challenges effectively.

Key learning points and tutorial structure include:

  1. Introduction to the Sample Project: Andreas showcases a project setup involving Snowflake as the data warehouse, dbt for data modeling and testing, and a visualization tool for data analysis. This setup serves as the basis for the tutorial.
  2. Challenges in Monitoring dbt Jobs: Common issues in monitoring dbt jobs are discussed, highlighting the limitations of the dbt interface in providing comprehensive monitoring capabilities.
  3. Introduction to Elementary: Elementary is introduced as a dbt-native data observability tool designed to enhance the monitoring and analysis of dbt jobs. It offers both open-source and cloud versions, with the tutorial focusing on the cloud version.
  4. Setup Requirements: The tutorial covers the necessary setup on both the Snowflake and dbt sides, including schema creation, user and role configuration in Snowflake, and modifications to the dbt project for integrating with Elementary.
  5. Elementary's User Interface and Features: A thorough walkthrough of Elementary's interface is provided, showcasing its dashboard, test results, model runs, data catalog, and data lineage features. The tool's ability to automatically run additional tests, like anomaly detection and schema change detection, is also highlighted.
  6. Advantages of Using Elementary: The presenter outlines several benefits of using Elementary, such as easy implementation, native test integration, clean and straightforward UI, and enhanced privacy due to data being stored within the user's data warehouse.
  7. Potential Drawbacks: Some potential drawbacks are discussed, including the additional load on dbt job execution due to more models being run and limitations in dashboard customization.
  8. Summary and Verdict: The tutorial concludes with a summary of the key features and benefits of using Elementary with dbt, emphasizing its value in improving data quality monitoring and detection.

Overall, viewers are guided through setting up and utilizing Elementary for dbt data monitoring, gaining insights into its capabilities, setup process, and the practical benefits it offers for data quality assurance.

Solving Engineers' 4 Biggest Airflow Problems#

Free hands-on tutorial: Watch on YouTube

In this informative video, Andreas discusses the four major challenges engineers face when working with Apache Airflow and introduces Astronomer, a managed Airflow service that addresses these issues effectively. Astronomer is highlighted as a solution that simplifies Airflow deployment and management, making it easier for engineers to develop, deploy, and monitor their data pipelines. Here's a summary of the key points discussed for each challenge and how Astronomer provides solutions:

  1. Managing Airflow Deployments:
  • Challenge: Setting up and maintaining Airflow deployments is complex and time-consuming, involving configuring cloud instances, managing resources, scaling, and updating the Airflow system.
  • Solution with Astronomer: Offers a straightforward deployment process where users can easily configure their deployments, choose cloud providers (GCP, AWS, Azure), and set up scaling with just a few clicks. Astronomer handles the complexity, making it easier to manage production and quality environments.
  2. Development Environment and Deployment:
  • Challenge: Local installation of Airflow is complicated due to its dependency on multiple Docker containers and the need for extensive configuration.
  • Solution with Astronomer: Provides a CLI tool for setting up a local development environment with a single command, simplifying the process of developing, testing, and deploying pipelines. The Astronomer CLI also helps in initializing project templates and deploying DAGs to the cloud effortlessly.
  3. Source Code Management and CI/CD Pipelines:
  • Challenge: Collaborative development and continuous integration/deployment (CI/CD) are essential but challenging to implement effectively with Airflow alone.
  • Solution with Astronomer: Facilitates easy integration with GitHub for source code management and GitHub Actions for CI/CD. This allows automatic testing and deployment of pipeline code, ensuring a smooth workflow for teams working on pipeline development.
  4. Observing Pipelines and Alarms:
  • Challenge: Monitoring data pipelines and getting timely alerts when issues occur is crucial but often difficult to achieve.
  • Solution with Astronomer: The Astronomer platform provides a user-friendly interface for monitoring pipeline status and performance. It also offers customizable alerts for failures or prolonged task durations, with notifications via email, PagerDuty, or Slack, ensuring immediate awareness and response to issues.

Overall, the video shows Astronomer as a powerful and user-friendly platform that addresses the common challenges of using Airflow, from deployment and development to collaboration, CI/CD, and monitoring. It suggests that Astronomer can significantly improve the experience of engineers working with Airflow, making it easier to manage, develop, and monitor data pipelines.

The best alternative to Airflow? Mage.ai#

Free hands-on tutorial: Watch on YouTube

In this insightful video, Andreas introduces Mage, a promising alternative to Apache Airflow, focusing on its simplicity, user-friendliness, and scalability. The video provides a comprehensive walkthrough of Mage, highlighting its key features and advantages over Airflow. Here's a breakdown of what viewers can learn and expect from the video:

  1. Deployment Ease: Mage offers a stark contrast to Airflow's complex setup process. It simplifies deployment to a single Docker image, making it straightforward to install and start on any machine, whether it's local or cloud-based on AWS, GCP, or Azure. This simplicity extends to scaling, which Mage handles horizontally, particularly beneficial in Kubernetes environments where performance scales with the number of pipelines.
  2. User Interface (UI): Mage shines with its UI, presenting a dark mode interface that's not only visually appealing but also simplifies navigation and pipeline management. The UI facilitates easy access to pipelines, scheduling, and monitoring of pipeline runs, offering a more intuitive experience compared to Airflow.
  3. Pipeline Creation and Modification: Mage streamlines the creation of ETL pipelines, allowing users to easily add data loaders, transformers, and exporters through its UI. It supports direct interaction with APIs for data loading and provides a visual representation of the data flow, enhancing the overall pipeline design experience.
  4. Data Visualization and Exploration: Beyond simple pipeline creation, Mage enables in-depth data exploration within the UI. Users can generate various charts, such as histograms and bar charts, to analyze the data directly, a feature that greatly enhances the tool's utility.
  5. Testing and Scheduling: Testing pipelines in Mage is straightforward, allowing for quick integration of tests to ensure data quality and pipeline reliability. Scheduling is also versatile, supporting standard time-based triggers, event-based triggers for real-time data ingestion, and API calls for on-demand pipeline execution.
  6. Support for Streaming and ELT Processes: Mage is not limited to ETL workflows but also supports streaming and ELT processes. It integrates seamlessly with DBT models for in-warehouse transformations and Spark for big data processing, showcasing its versatility and scalability.
  7. Conclusion and Call to Action: Andreas concludes by praising the direction in which the industry is moving, with tools like Mage simplifying data engineering processes. He encourages viewers to try Mage and engage with the content by liking, subscribing, and commenting on their current tools and the potential impact of Mage.

Overall, the video shows Mage as a highly user-friendly, scalable, and versatile tool for data pipeline creation and management, offering a compelling alternative to traditional tools like Airflow.
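
To give a feeling for what working in Mage looks like, below is a rough sketch of a data loader block that pulls JSON from a public API. It mirrors the block template Mage generates in its UI, but treat the details (the guarded import, the decorator path, the example URL) as assumptions and let Mage scaffold the real block for you:

import pandas as pd
import requests

# In a Mage project the decorator is injected by the framework;
# the guarded import below mirrors Mage's generated block templates.
if 'data_loader' not in globals():
    from mage_ai.data_preparation.decorators import data_loader

@data_loader
def load_data_from_api(*args, **kwargs):
    # Example API call, replace the URL with your own data source
    url = "https://api.open-meteo.com/v1/forecast?latitude=52.52&longitude=13.41&hourly=temperature_2m"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    hourly = response.json()["hourly"]
    return pd.DataFrame(hourly)  # Mage passes this DataFrame to the next block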

- + @@ -47,6 +47,6 @@ - + \ No newline at end of file diff --git a/docs/05-CaseStudies/index.html b/docs/05-CaseStudies/index.html index f733273..597a99f 100644 --- a/docs/05-CaseStudies/index.html +++ b/docs/05-CaseStudies/index.html @@ -7,7 +7,7 @@ 05-CaseStudies | THE DATA ENGINEERING COOKBOOK - + @@ -131,7 +131,7 @@ https://databricks.com/session/continuous-applications-at-scale-of-100-teams-with-databricks-delta-and-structured-streaming

Talk at Strata London slides: https://databricks.com/session/continuous-applications-at-scale-of-100-teams-with-databricks-delta-and-structured-streaming

https://jobs.zalando.com/tech/blog/what-is-hardcore-data-science--in-practice/?gh_src=4n3gxh1

https://jobs.zalando.com/tech/blog/complex-event-generation-for-business-process-monitoring-using-apache-flink/

- + diff --git a/docs/06-BestPracticesCloud/index.html b/docs/06-BestPracticesCloud/index.html index 4c40c5e..37ab405 100644 --- a/docs/06-BestPracticesCloud/index.html +++ b/docs/06-BestPracticesCloud/index.html @@ -7,7 +7,7 @@ 06-BestPracticesCloud | THE DATA ENGINEERING COOKBOOK - + @@ -26,7 +26,7 @@ They are also useful for AWS and GCP, just try to change out the tools.

As always, I am going to add more stuff to this over time.

Have fun!

Contents#

AWS#

Connect#

  • Elastic Beanstalk (very old)
  • SES Simple Email Service
  • API Gateway

Buffer#

  • Kinesis
  • Kinesis Data Firehose
  • Managed Streaming for Kafka (MSK)
  • MQ
  • Simple Queue Service (SQS)
  • Simple Notification Service (SNS)

Processing#

  • EC2
  • Athena
  • EMR
  • Elasticsearch
  • Kinesis Data Analytics
  • Glue
  • Step Functions
  • Fargate
  • Lambda
  • SageMaker

Store#

  • Simple Storage Service (S3)
  • Redshift
  • Aurora
  • RDS
  • DynamoDB
  • ElastiCache
  • Neptune Graph DB
  • Timestream
  • DocumentDB (MongoDB compatible)

Visualize#

  • Quicksight

Containerization#

  • Elastic Container Service (ECS)
  • Elastic Container Registry (ECR)
  • Elastic Kubernetes Service (EKS)

Best Practices#

Deploying a Spring Boot Application on AWS Using AWS Elastic Beanstalk:

https://aws.amazon.com/de/blogs/devops/deploying-a-spring-boot-application-on-aws-using-aws-elastic-beanstalk/

How to deploy a Docker Container on AWS:

https://aws.amazon.com/getting-started/hands-on/deploy-docker-containers/

AWS platform architecture for GenAI#

Click here to watch

I recorded a reaction video to an AWS platform architecture for GenAI called Tailwinds. Presented by John from Innovative Solutions and Josh from AWS, it has two main flows: indexing and consumer.

Data enters through S3 buckets or an API gateway, is processed by AWS Lambda or Glue, stored in a vector or graph database, and then indexed in OpenSearch. Applications like chatbots use an API gateway to trigger Lambda functions for data retrieval and processing. This flexible serverless setup supports various data formats and uses tools like SAM and Terraform.

Amazon Bedrock helps customers choose and evaluate models. The architecture is flexible but requires effort to create the necessary Lambda functions. Check out the video and share your thoughts!
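
As a rough illustration of the indexing flow described above (my sketch, not Tailwinds' actual code), a Lambda-style function could embed a text chunk with a Bedrock embedding model and write it into OpenSearch along these lines; the endpoint, credentials, index name, and model id are assumptions:

import json
import boto3
from opensearchpy import OpenSearch

bedrock = boto3.client("bedrock-runtime")
opensearch = OpenSearch(hosts=["https://my-opensearch-endpoint:443"], http_auth=("user", "pass"))  # placeholders

def index_chunk(doc_id: str, text: str) -> None:
    # 1. Create an embedding for the text chunk with a Bedrock embedding model
    response = bedrock.invoke_model(
        modelId="amazon.titan-embed-text-v1",
        body=json.dumps({"inputText": text}),
    )
    embedding = json.loads(response["body"].read())["embedding"]

    # 2. Store text and vector together (assumes an index with a k-NN mapping for "embedding")
    opensearch.index(
        index="documents",
        id=doc_id,
        body={"text": text, "embedding": embedding},
    )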

Click here to watch

Generative AI enabled job search engine#

Click here to watch

Hey everyone, I recorded a reaction video to an AWS platform architecture for a Gen AI job search engine. Presented by Andrea from AWS and Bill from Healthy Careers, this setup uses generative AI to enhance job searches for healthcare professionals.

The architecture uses Elastic Container Service (ECS) to handle user queries, processed by Claude II for prompt checks and geolocation. Cleaned prompts are vectorized using Amazon's Titan model, with user search history fetched from an SQL database. Search results are stored in Elasticsearch, updating every six hours. Finally, Claude II generates a response from the search results and sends it back to the user.

I found the use of Claude II for prompt sanitization and geolocation, and the integration of multiple AI models through AWS Bedrock, particularly interesting. This setup keeps data private and provides a flexible, efficient job search experience.

Check out the video and share your thoughts!

Voice transcription and analysis on AWS#

Click here to watch

Hey everyone, I recorded a reaction video to an AWS architecture for voice transcription and analysis. Presented by Nuan from AWS and Ben from Assembly AI, this system is designed to handle large-scale audio data processing.

Users upload audio data via an API to an ECS container. The data is then managed by an orchestrator that decides which models to use and in what order. The orchestrator sends tasks to SQS, which triggers various ML models running on ECS. These models handle tasks like speech-to-text conversion, sentiment analysis, and speaker labeling. Results are stored in S3 and users are notified via SNS and a Lambda function when processing is complete.

I found the use of ECS for containerized applications and the flexibility of swapping models through ECR particularly interesting. This architecture ensures scalability and efficiency, making it ideal for handling millions of requests per day.

Check out the video and share your thoughts!

GeoSpatial Data Analysis#

Click here to watch

Hey everyone, I recorded a reaction video to an AWS architecture for geospatial data analysis by TCS. Presented by David John and Suryakant from TCS, this platform is used in next-gen agriculture for tasks like crop health, yield, and soil moisture analysis.

The platform uses data from satellites, AWS open data, and field agents, processing it with Lambda, Sagemaker, and PostgreSQL. Data is stored and analyzed in S3 buckets and PostgreSQL, with results made accessible via EKS-deployed UIs on EC2 instances, buffered through CloudFront for efficiency.

Key aspects include:

  • Lambda functions triggering Sagemaker jobs for machine learning.
  • Sagemaker handling extensive processing tasks.
  • PostgreSQL and S3 for storing processed data.
  • CloudFront caching data to enhance user experience.

I found the use of parallel Sagemaker jobs for scalability and the integration of open data for cost efficiency particularly interesting. This setup effectively meets the agricultural sector's data analysis needs.

Check out the video and share your thoughts!

Building a Self-Service Enterprise Data Engineering Platform#

Click here to watch

Hey everyone, I recorded a reaction video to an AWS architecture for a self-service enterprise data engineering platform by ZS Associates. Presented by David John and Laken from ZS Associates, this platform is designed to streamline data integration, infrastructure provisioning, and data access for life sciences companies.

Key components:

  • Users and Interaction: Data engineers and analysts interact through a self-service web portal, selecting infrastructure types and providing project details. This portal makes REST requests to EKS, which creates records in PostgreSQL and triggers infrastructure provisioning via SQS.
  • Infrastructure Provisioning: EKS processes SQS messages to provision infrastructure such as EMR clusters, databases in Glue Catalog, S3 buckets, and EC2 instances with containerized services like Airflow or NiFi. IAM roles are configured for access control.
  • Data Governance and Security: All data sets are accessed through the Glue Catalog, with governance workflows requiring approval from data owners via SES notifications. EKS updates IAM roles and Ranger policies for fine-grained access control.
  • Scalability and Efficiency: EKS hosts 100+ microservices supporting workflows and UI portals. The platform handles millions of API requests and hundreds of data access requests monthly, with auto-scaling capabilities to manage costs.

This architecture effectively reduces time to market, enhances security at scale, and optimizes costs by automating data access and infrastructure provisioning. It also ensures data governance and security through controlled access and approval processes.

Check out the video and share your thoughts!

Customer Support Platform#

Click here to watch

Hey everyone, I recorded a reaction video to an AWS architecture for a personalized customer support platform by Traeger. Presented by David John and Lizzy from Traeger, this system enhances customer support by leveraging data from Shopify, EventBridge, Kinesis Data Firehose, S3, Lambda, DynamoDB, and Amazon Connect.

Key components:

  • Order Processing: Customer order data from Shopify flows into EventBridge, then to Kinesis Data Firehose, which writes it to S3. An event trigger in S3 invokes a Lambda function that stores specific order metadata in DynamoDB.
  • Personalized Customer Support: When a customer calls, Amazon Connect uses Pinpoint to determine the call's origin, personalizing the language options. Connect triggers a Lambda function to query DynamoDB for customer metadata based on the phone number. This data is used to inform the customer support agent.
  • Reason for Contact: Amazon Lex bot asks the customer the reason for their call, and this information, along with customer metadata, routes the call to a specialized support queue.

I found the use of DynamoDB for storing customer metadata and the integration with Amazon Connect and Lex for personalized support particularly interesting. The architecture is scalable and ensures a personalized experience for customers.
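
As a small illustration of the order-processing step (an assumption-heavy sketch, not Traeger's code: the bucket layout, JSON fields, and the order-metadata table name are invented for the example), the S3-triggered Lambda could look roughly like this:

import json
import boto3

s3 = boto3.client("s3")
table = boto3.resource("dynamodb").Table("order-metadata")  # hypothetical table name

def lambda_handler(event, context):
    # The S3 event contains one record per uploaded order file
    for record in event["Records"]:
        bucket = record["s3"]["bucket"]["name"]
        key = record["s3"]["object"]["key"]

        # Read the order JSON that Kinesis Data Firehose wrote to S3
        order = json.loads(s3.get_object(Bucket=bucket, Key=key)["Body"].read())

        # Keep only the metadata needed to personalize a support call (fields assumed)
        table.put_item(Item={
            "phone_number": order["customer"]["phone"],
            "order_id": str(order["id"]),
            "status": order.get("fulfillment_status", "unknown"),
        })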

Check out the video and share your thoughts!

League of Legends Data Platform on AWS#

Click here to watch

Hey everyone, I recorded a reaction video to an AWS architecture for the data platform that powers League of Legends by Riot Games. Presented by David John and the team at Riot Games, this system handles massive amounts of data generated by millions of players worldwide.

Key components:

  • Player Interaction: Players connect to game servers globally. The game client communicates with an API running in EKS. This setup ensures low latency and optimal performance.
  • Data Ingestion: The game client and server send data about player interactions to EKS, which flows into MSK (Managed Streaming for Kafka). Local Kafka clusters buffer the data before it’s replicated to regional MSK clusters using MirrorMaker.
  • Data Processing: Spark Streaming jobs process the data from MSK and store it in Delta Lake on S3. This setup ensures efficient data handling and reduces latency in data availability.
  • Data Storage and Access: Glue serves as the data catalog, managing metadata and permissions. Data consumers, including analysts, designers, engineers, and executives, access this data through Databricks, leveraging Glue for structured queries.

I found the use of MSK and Spark for scalable data ingestion and processing particularly interesting. This architecture supports real-time analytics, allowing Riot Games to quickly assess the impact of new patches and gameplay changes.
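
The Kafka-to-Delta step can be pictured with a small PySpark structured streaming sketch (illustrative only, not Riot's code; broker addresses, topic name, and S3 paths are placeholders, and it assumes a Spark runtime with the Kafka source and Delta Lake packages available):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("game-events-to-delta").getOrCreate()

# Read the replicated game events from the regional MSK/Kafka cluster
events = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "broker-1:9092")   # placeholder brokers
    .option("subscribe", "game-events")                   # placeholder topic
    .option("startingOffsets", "latest")
    .load()
)

# Persist the raw payload to a Delta table on S3 for downstream consumers
query = (
    events.selectExpr("CAST(value AS STRING) AS payload", "timestamp")
    .writeStream.format("delta")
    .option("checkpointLocation", "s3://my-bucket/checkpoints/game-events")  # placeholder path
    .start("s3://my-bucket/delta/game-events")                               # placeholder path
)
query.awaitTermination()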

Check out the video and share your thoughts!

Platform Connecting 70 Million Cars#

Click here to watch

Hey everyone, I recorded a reaction video to an AWS architecture for a connected car platform by Mobileye. Presented by David John and the team at Mobileye, this system connects 70 million cars, collecting and processing data to offer digital services and fleet analysis.

Key components:

  • Data Collection: Cars collect anonymized data using sensors and visual inspections, sending it to a REST API and storing it in S3.
  • Data Processing: The data is pulled from S3 into SQS and processed by EKS workers, which scale according to the queue size. Processed data is stored back in S3 and further analyzed using step functions and Lambda for tasks like extracting construction zones and clustering observations.
  • Data Storage: Processed data is stored in S3, Elasticsearch, and CockroachDB. Elasticsearch handles document-based data with self-indexing, while CockroachDB supports frequent updates.
  • Data Consumption: EKS hosts a secured REST API and web application, allowing customers like city planners to access insights on pedestrian and bicycle traffic.

Future plans include enabling cloud image processing on EKS with GPU instances and focusing on cost reduction as data flow increases.

I found the use of EKS for scalable data processing and the combination of Elasticsearch and CockroachDB for different data needs particularly interesting. This architecture efficiently handles large-scale data from millions of connected cars.

Check out the video and share your thoughts!

55TB A Day: Nielsen AWS Data Architecture#

Click here to watch

Hey everyone, I recorded a reaction video to an AWS architecture for Nielsen Marketing Cloud, which processes 55TB of data daily. Presented by David John, this system handles marketing segmentation data for campaigns.

Key components:

  • Data Ingestion: Marketing data comes in files, written to S3. Spark on EMR processes and transforms the data, writing the output to another S3 bucket.
  • Data Processing: Lambda functions handle the final formatting and upload the data to over 100 ad networks. Metadata about file processing is managed in a PostgreSQL RDS database.
  • Metadata Management: A work manager Lambda reads metadata from RDS, triggers processing jobs in EMR, and updates the metadata post-processing.
  • Scaling and Rate Limiting: The serverless architecture allows automatic scaling. However, rate limiting is implemented to prevent overloading ad networks, ensuring they handle data bursts smoothly.

Challenges and Solutions:

  • Scale: The system handles 250 billion events per day, scaling up and down automatically to manage peak loads.
  • Rate Limiting: To avoid overwhelming ad networks, a rate-limiting mechanism was introduced, managing data flow based on network capacity.
  • Back Pressure Management: SQS is used to buffer Lambda responses, preventing direct overload on the PostgreSQL database.

I found the use of SQS for metadata management and the serverless architecture for handling massive data loads particularly interesting. This setup ensures efficient data processing and smooth delivery to ad networks.

Check out the video and share your thoughts!

Orange Theory Fitness#

Click here to watch

Hey, everybody! Today, I'm reacting to the AWS data infrastructure at Orange Theory Fitness, where they collect data from wristbands and training machines. Let's dive in and see how they manage it all.

Key Components#

  1. Local Server: Aggregates data from in-studio equipment and mobile apps, ensuring resiliency if the cloud connection is lost.
  2. API Gateway and Cognito: Handle authentication and route data to the cloud.
  3. Lambda Functions: Process data.
  4. Aurora RDS (MySQL): Stores structured data like member profiles, class bookings, and studio information.
  5. DynamoDB: Stores performance metrics and workout statistics for quick access.
  6. S3: Serves as a data lake, storing telemetry data.
  7. Kinesis Firehose: Streams telemetry data to S3.

Challenges & Solutions#

  1. Resiliency

    • Challenge: Ensure operations continue if cloud connection is lost.
    • Solution: Local server aggregates data and syncs with the cloud once the connection is restored.
  2. Data Integration

    • Challenge: Integrate data from various sources.
    • Solution: Use API Gateway and Cognito for unified authentication and data routing.
  3. Data Processing

    • Challenge: Efficiently process and store different types of data.
    • Solution: Use Lambda for processing, Aurora RDS for structured data, DynamoDB for quick access to performance metrics, and Kinesis Firehose with S3 for streaming and storing large volumes of telemetry data.

This architecture leverages AWS tools for scalability, flexibility, and resilience, making it an excellent example of a well-thought-out data infrastructure for a fitness application.

Let me know your thoughts in the comments. What do you think of this architecture? Would you have done anything differently? If you have any questions, feel free to ask. And if you're interested in learning more about data engineering, check out my academy at learndataengineering.com. See you in the next video!

More Details#

AWS Whitepapers:

https://d1.awsstatic.com/whitepapers/aws-overview.pdf

Azure#

Connect#

  • Event Hub
  • IoT Hub

Buffer#

  • Data Factory
  • Event Hub
  • RedisCache (also Store)

Processing#

  • Stream Analytics Service
  • Azure Databricks
  • Machine Learning
  • Azure Functions
  • Azure HDInsight (Hadoop PaaS)

Store#

  • Blob
  • CosmosDB
  • MariaDB
  • MySQL
  • PostgreSQL
  • SQL
  • Azure Data lake
  • Azure Storage (SQL Table?)
  • Azure Synapse Analytics

Visualize#

  • PowerBI

Containerization#

  • Virtual Machines
  • Virtual Machine Scale Sets
  • Azure Container Service (ACS)
  • Container Instances
  • Azure Kubernetes Service

Best Practices#

Advanced Analytics Architecture:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/advanced-analytics-on-big-data

Anomaly Detection in Real-time Data Streams:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/anomaly-detection-in-real-time-data-streams

Modern Data Warehouse Architecture:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/modern-data-warehouse

CI/CD for Containers:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/cicd-for-containers

Real Time Analytics on Big Data Architecture:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/real-time-analytics

IoT Architecture – Azure IoT Subsystems:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/azure-iot-subsystems

Tier Applications & Data for Analytics:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/tiered-data-for-analytics

Extract, transform, and load (ETL) using HDInsight:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/extract-transform-and-load-using-hdinsight

IoT using Cosmos DB:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/iot-using-cosmos-db

Streaming using HDInsight:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/streaming-using-hdinsight

GCP#

Connect#

  • Cloud IoT Core
  • App Engine
  • Cloud Dataflow

Buffer#

  • Pub/Sub

Processing#

  • Compute Engine
  • Cloud Functions
  • Specialized tools:
    • Cloud Dataflow
    • Cloud Dataproc
    • Cloud Datalab
    • Cloud Dataprep
    • Cloud Composer
  • App Engine

Store#

  • Cloud Storage
  • Cloud SQL
  • Cloud Spanner
  • Cloud Datastore
  • Cloud BigTable
  • Cloud Memorystore
  • BigQuery

Visualize#

Containerization#

  • Kubernetes Engine
  • Container Security

Best Practices#

Thanks to Ismail Holoubi for the following GCP links

Best practices for migrating virtual machines to Compute Engine:

https://cloud.google.com/solutions/best-practices-migrating-vm-to-compute-engine

Best practices for Cloud Storage:

https://cloud.google.com/storage/docs/best-practices

Moving a publishing workflow to BigQuery for new data insights:

https://cloud.google.com/blog/products/data-analytics/moving-a-publishing-workflow-to-bigquery-for-new-data-insights

Architecture: Optimizing large-scale ingestion of analytics events and logs:

https://cloud.google.com/solutions/architecture/optimized-large-scale-analytics-ingestion

Choosing the right architecture for global data distribution:

https://cloud.google.com/solutions/architecture/global-data-distribution

Best Practices for Operating Containers:

https://cloud.google.com/solutions/best-practices-for-operating-containers

Automating IoT Machine Learning: Bridging Cloud and Device Benefits with AI Platform:

https://cloud.google.com/solutions/automating-iot-machine-learning

- + diff --git a/docs/07-DataSources/index.html b/docs/07-DataSources/index.html index d0ef4d5..e1c6a05 100644 --- a/docs/07-DataSources/index.html +++ b/docs/07-DataSources/index.html @@ -7,7 +7,7 @@ 07-DataSources | THE DATA ENGINEERING COOKBOOK - + @@ -23,7 +23,7 @@ So, I started this section to make it easier to find good sources.

I've taken these links from articles and blog posts. Why not only link the articles? You know, these posts can go away at any time. I want to keep the links to the platforms either way.

I haven't had the chance to check each link myself. Please let me know if something isn't right.

You can find the articles on the bottom of this section to read more. They include even more data sources I haven't had time to add to this list.

Contents:#

General And Academic#

Content Marketing#

Crime#

Drugs#

Education#

Entertainment#

Environmental And Weather Data#

Financial And Economic Data#

Government And World#

Health#

Human Rights#

Labor And Employment Data#

Politics#

Retail#

Social#

Travel And Transportation#

Various Portals#

Source Articles and Blog Posts#

- + diff --git a/docs/08-InterviewQuestions/index.html b/docs/08-InterviewQuestions/index.html index ab8214f..6d14bd1 100644 --- a/docs/08-InterviewQuestions/index.html +++ b/docs/08-InterviewQuestions/index.html @@ -7,7 +7,7 @@ 08-InterviewQuestions | THE DATA ENGINEERING COOKBOOK - + @@ -46,7 +46,7 @@ considered when choosing a DB?)

  • How to choose the right storage for different data consumers? It's always a tricky question.

Apache Flink#

  • What is Flink used for?

  • Flink vs Spark?

GitHub#

  • What are branches?

  • What are commits?

  • What's a pull request?

Dev/Ops#

  • What is continuous integration?

  • What is continuous deployment?

  • Difference CI/CD

Development / Agile#

  • What is Scrum?

  • What is OKR?

  • What is Jira and what is it used for?

    - + diff --git a/docs/09-BooksAndCourses/index.html b/docs/09-BooksAndCourses/index.html index f521638..076635e 100644 --- a/docs/09-BooksAndCourses/index.html +++ b/docs/09-BooksAndCourses/index.html @@ -7,7 +7,7 @@ 09-BooksAndCourses | THE DATA ENGINEERING COOKBOOK - + @@ -22,7 +22,7 @@

    09-BooksAndCourses

    Recommended Books, Courses, and Podcasts#

    Contents#

    About Books, Courses, and Podcasts#

    This is a collection of books and courses I can recommend personally. They are great for every data engineering learner.

    I have either used or owned these books during my professional work.

    I also looked into every online course personally.

    If you want to buy a book or course and support my work, please use one of my links below. They are all affiliate marketing links that help me fund this passion.

    Of course all this comes at no additional expense to you, but it helps me a lot.

    You can find even more interesting books and my whole podcast equipment on my Amazon store:

    Go to the Amazon store

    PS: Don't just get a book and expect to learn everything

    • Course certificates alone won't help you
    • Have a purpose in mind, like a small project
    • Books and courses are also great to use as references at work

    Books#

    Languages#

    Java#

    Learning Java: A Bestselling Hands-On Java Tutorial

    Python#

    Learning Python, 5th Edition

    Scala#

    Programming Scala: Scalability = Functional Programming + Objects

    Swift#

    Learning Swift: Building Apps for macOS, iOS, and Beyond

    Data Science Tools#

    Apache Spark#

    Learning Spark: Lightning-Fast Big Data Analysis

    Apache Kafka#

    Kafka Streams in Action: Real-time apps and microservices with the Kafka Streams API

    Apache Hadoop#

    Hadoop: The Definitive Guide: Storage and Analysis at Internet Scale

    Apache HBase#

    HBase: The Definitive Guide: Random Access to Your Planet-Size Data

    Business#

    The Lean Startup#

    The Lean Startup: How Today's Entrepreneurs Use Continuous Innovation to Create Radically Successful Businesses

    Zero to One#

    Zero to One: Notes on Startups, or How to Build the Future

    The Innovator's Dilemma#

    The Innovator's Dilemma: When New Technologies Cause Great Firms to Fail (Management of Innovation and Change)

    Crossing the Chasm#

    Crossing the Chasm, 3rd Edition (Collins Business Essentials)

    Crush It!#

    Crush It!: Why Now Is The Time To Cash In On Your Passion

    Community Recommendations#

    Designing Data-Intensive Applications#

    "In my opinion, the knowledge contained in this book differentiates a data engineer from a software engineer or a developer. The book strikes a good balance between breadth and depth of discussion on data engineering topics, as well as the tradeoffs we must make due to working with massive amounts of data." -- David Lee on LinkedIn

    Designing Data-Intensive Applications: The Big Ideas Behind Reliable, Scalable, and Maintainable Systems

    Online Courses#

    Preparation courses#

Course name | Course description | Course URL
The Bits and Bytes of Computer Networking | This course is designed to provide a full overview of computer networking. We'll cover everything from the fundamentals of modern networking technologies and protocols to an overview of the cloud to practical applications and network troubleshooting. | https://www.coursera.org/learn/computer-networking
Learn SQL (Codecademy) | In this SQL course, you'll learn how to manage large datasets and analyze real data using the standard data management language. | https://www.codecademy.com/learn/learn-sql
Learn Python 3 (Codecademy) | Learn the basics of Python 3, one of the most powerful, versatile, and in-demand programming languages today. | https://www.codecademy.com/learn/learn-python-3

    Data engineering courses#

Course name | Course description | Course URL

1. Data Engineering Basics
Introduction to Data Engineering | Introduction to Data Engineering with over 1 hour of videos, including my journey here. | https://learndataengineering.com/p/introduction-to-data-engineering
Computer Science Fundamentals | A complete guide of topics and resources you should know as a Data Engineer. | https://learndataengineering.com/p/data-engineering-fundamentals
Introduction to Python | Learn all the fundamentals of Python to start coding quickly. | https://learndataengineering.com/p/introduction-to-python
Python for Data Engineers | Learn all the Python topics a Data Engineer needs, even if you don't have a coding background. | https://learndataengineering.com/p/python-for-data-engineers
Docker Fundamentals | Learn all the fundamental Docker concepts with hands-on examples. | https://learndataengineering.com/p/docker-fundamentals
Successful Job Application | Everything you need to get your dream job in Data Engineering. | https://learndataengineering.com/p/successful-job-application
Data Preparation & Cleaning for ML | All you need for preparing data to enable Machine Learning. | https://learndataengineering.com/p/data-preparation-and-cleaning-for-ml

2. Platform & Pipeline Design Fundamentals
Data Platform And Pipeline Design | Learn how to build data pipelines with templates and examples for Azure, GCP and Hadoop. | https://learndataengineering.com/p/data-pipeline-design
Platform & Pipelines Security | Learn the important security fundamentals for Data Engineering. | https://learndataengineering.com/p/platform-pipeline-security
Choosing Data Stores | Learn the different types of data stores and when to use which. | https://learndataengineering.com/p/choosing-data-stores
Schema Design Data Stores | Learn how to design schemas for SQL, NoSQL and Data Warehouses. | https://learndataengineering.com/p/data-modeling

3. Fundamental Tools
Building APIs with FastAPI | Learn the fundamentals of designing, creating and deploying APIs with FastAPI and Docker. | https://learndataengineering.com/p/apis-with-fastapi-course
Apache Kafka Fundamentals | Learn the fundamentals of Apache Kafka. | https://learndataengineering.com/p/apache-kafka-fundamentals
Apache Spark Fundamentals | Apache Spark quick start course in Python with Jupyter notebooks, DataFrames, SparkSQL and RDDs. | https://learndataengineering.com/p/learning-apache-spark-fundamentals
Data Engineering on Databricks | Everything you need to get started with Databricks, from setup to building ETL pipelines & warehousing. | https://learndataengineering.com/p/data-engineering-on-databricks
MongoDB Fundamentals | Learn how to use MongoDB. | https://learndataengineering.com/p/mongodb-fundamentals-course
Log Analysis with Elasticsearch | Learn how to monitor and debug your data pipelines. | https://learndataengineering.com/p/log-analysis-with-elasticsearch
Airflow Workflow Orchestration | Learn how to orchestrate your data pipelines with Apache Airflow. | https://learndataengineering.com/p/learn-apache-airflow
Snowflake for Data Engineers | Everything you need to get started with Snowflake. | https://learndataengineering.com/p/snowflake-for-data-engineers
dbt for Data Engineers | Everything you need to work with dbt and Snowflake. | https://learndataengineering.com/p/dbt-for-data-engineers

4. Full Hands-On Example Projects
Data Engineering on AWS | Full 5-hour course with a complete example project: building stream and batch processing pipelines on AWS. | https://learndataengineering.com/p/data-engineering-on-aws
Data Engineering on Azure | Ingest, Store, Process, Serve and Visualize Streams of Data by Building Streaming Data Pipelines in Azure. | https://learndataengineering.com/p/build-streaming-data-pipelines-in-azure
Data Engineering on GCP | Everything you need to start with Google Cloud. | https://learndataengineering.com/p/data-engineering-on-gcp
Modern Data Warehouses & Data Lakes | How to integrate a Data Lake with a Data Warehouse and query data directly from files. | https://learndataengineering.com/p/modern-data-warehouses
Machine Learning & Containerization On AWS | Build an app that analyzes the sentiment of tweets and visualizes them on a user interface hosted as a container. | https://learndataengineering.com/p/ml-on-aws
Contact Tracing with Elasticsearch | Track 100,000 users in San Francisco using Elasticsearch and an interactive Streamlit user interface. | https://learndataengineering.com/p/contact-tracing-with-elasticsearch
Document Streaming Project | Document streaming with FastAPI, Kafka, Spark Streaming, MongoDB and Streamlit. | https://learndataengineering.com/p/document-streaming
Storing & Visualizing Time Series Data with InfluxDB and Grafana | Learn how to use InfluxDB to store time series data and visualize interactive dashboards with Grafana. | https://learndataengineering.com/p/time-series-influxdb-grafana
Data Engineering with Hadoop | Hadoop project with HDFS, YARN, MapReduce, Hive and Sqoop! | https://learndataengineering.com/p/data-engineering-with-hadoop
Dockerized ETL | Learn how to quickly set up a simple ETL script with AWS, TDengine & Grafana. | https://learndataengineering.com/p/timeseries-etl-with-aws-tdengine-grafana

    Certifications#

    Here's a list of great certifications you can do on AWS and Azure. We left out GCP because the adoption of AWS and Azure is a lot higher, which is why I recommend starting with one of these. The prices are usually the fees for taking the certification exams. We also added the level and prerequisites to make it easier for you to decide which one fits you.

Platform | Certification Name | Price (USD) | Level | Prerequisite Experience | URL
AWS | AWS Certified Cloud Practitioner (maybe) | 100 | Beginner | Familiarity with the AWS platform is recommended but not required. | Link
AWS | AWS Certified Solutions Architect | 300 | Expert | AWS Certified Solutions Architect - Professional is intended for individuals with two or more years of hands-on experience designing and deploying cloud architecture on AWS. | Link
AWS | AWS Certified Solutions Architect | 150 | Intermediate | This is an ideal starting point for candidates with AWS Cloud or strong on-premises IT experience. This exam does not require deep hands-on coding experience, although familiarity with basic programming concepts would be an advantage. | Link
AWS | AWS Certified Data Engineer | 150 | Intermediate | The ideal candidate for this exam has the equivalent of 2-3 years of experience in data engineering or data architecture and a minimum of 1-2 years of hands-on experience with AWS services. | Link
Azure | Microsoft Certified: Azure Cosmos DB Developer Specialty | 165 | Intermediate | - | Link
Azure | Microsoft Certified: Azure Data Engineer Associate - DP 203 | 165 | Intermediate | - | Link
Azure | Microsoft Certified: Azure Data Fundamentals | 99 | Beginner | - | Link
Azure | Microsoft Certified: Azure Database Administrator Associate | 165 | Intermediate | - | Link
Azure | Microsoft Certified: Azure Developer Associate | 165 | Intermediate | - | Link
Azure | Microsoft Certified: Azure Fundamentals | 99 | Beginner | - | Link
Azure | Microsoft Certified: Azure Solutions Architect Expert | 165 | Expert | Microsoft Certified: Azure Administrator Associate certification | Link
Azure | Microsoft Certified: Fabric Analytics Engineer Associate | 165 | Intermediate | - | Link
Azure | Microsoft Certified: Fabric Data Engineer Associate | 165 | Intermediate | - | Link
Azure | Microsoft Certified: Power BI Data Analyst Associate | 165 | Intermediate | - | Link

    Podcasts#

    Top five podcasts by the number of episodes created.

    Super Data Science#

    The latest machine learning, A.I., and data career topics from across both academia and industry are brought to you by host Dr. Jon Krohn on the Super Data Science Podcast.

    Data Skeptic#

    The Data Skeptic Podcast features interviews and discussion of topics related to data science, statistics, machine learning, artificial intelligence and the like, all from the perspective of applying critical thinking and the scientific method to evaluate the veracity of claims and efficacy of approaches.

    Data Engineering Podcast#

    This show goes behind the scenes for the tools, techniques, and difficulties associated with the discipline of data engineering. Databases, workflows, automation, and data manipulation are just some of the topics that you will find here.

    Roaring Elephant - Bite-Sized Big Tech#

    A weekly community podcast about Big Technology with a focus on Open Source, Advanced Analytics and other modern magic.

    SQL Data Partners Podcast#

    Hosted by Carlos L Chacon, the SQL Data Partners Podcast focuses on Microsoft data platform related topics mixed with a sprinkling of professional development. Carlos and guests discuss new and familiar features and ideas and how you might apply them in your environments.

    Complete list#

Host name | Podcast name | Access podcast
Jon Krohn | Super Data Science | https://www.superdatascience.com/podcast
Kyle Polich | Data Skeptic | https://dataskeptic.com/
Tobias Macey | Data Engineering Podcast | https://www.dataengineeringpodcast.com/
Dave Russell | Roaring Elephant - Bite-Sized Big Tech | https://roaringelephant.org/
Carlos L Chacon | SQL Data Partners Podcast | https://sqldatapartners.com/podcast/
Jason Himmelstein | BIFocal - Clarifying Business Intelligence | https://bifocal.show/
Scott Hirleman | Data Mesh Radio | https://daappod.com/data-mesh-radio/
Jonathan Schwabish | PolicyViz | https://policyviz.com/podcast/
Al Martin | Making Data Simple | https://www.ibm.com/blogs/journey-to-ai/2021/02/making-data-simple-this-week-we-continue-our-discussion-on-data-framework-and-what-is-meant-by-data-framework/
John David Ariansen | How to Get an Analytics Job | https://www.silvertoneanalytics.com/how-to-get-an-analytics-job/
Moritz Stefaner | Data Stories | https://datastori.es/
Hilary Parker | Not So Standard Deviations | https://nssdeviations.com/
Ben Lorica | The Data Exchange with Ben Lorica | https://thedataexchange.media/author/bglorica/
Juan Sequeda | Catalog & Cocktails | https://data.world/resources/podcasts/
Wayne Eckerson | Secrets of Data Analytics Leaders | https://www.eckerson.com/podcasts/secrets-of-data-analytics-leaders
Guy Glantser | SQL Server Radio | https://www.sqlserverradio.com/
Eitan Blumin | SQL Server Radio | https://www.sqlserverradio.com/
Jason Tan | The Analytics Show | https://ddalabs.ai/the-analytics-show/
Hugo Bowne-Anderson | DataFramed | https://www.datacamp.com/podcast
Kostas Pardalis | The Data Stack Show | https://datastackshow.com/
Eric Dodds | The Data Stack Show | https://datastackshow.com/
Catherine King | The Business of Data Podcast | https://podcasts.apple.com/gb/podcast/the-business-of-data-podcast/id1528796448
- | The Business of Data | https://business-of-data.com/podcasts/
James Le | Datacast | https://datacast.simplecast.com/
Mike Delgado | DataTalk | https://podcasts.apple.com/us/podcast/datatalk/id1398548129
Matt Housley | Monday Morning Data Chat | https://podcasts.apple.com/us/podcast/monday-morning-data-chat/id1565154727
Francesco Gadaleta | Data Science at Home | https://datascienceathome.com/
Alli Torban | Data Viz Today | https://dataviztoday.com/
Steve Jones | Voice of the DBA | https://voiceofthedba.com/
Lea Pica | The Present Beyond Measure Show: Data Storytelling, Presentation & Visualization | https://leapica.com/podcast/
Samir Sharma | The Data Strategy Show | https://podcasts.apple.com/us/podcast/the-data-strategy-show/id1515194422
Cindi Howson | The Data Chief | https://www.thoughtspot.com/data-chief/podcast
Cole Nussbaumer Knaflic | storytelling with data podcast | https://storytellingwithdata.libsyn.com/
Margot Gerritsen | Women in Data Science | https://www.widsconference.org/podcast.html
Jonas Christensen | Leaders of Analytics | https://www.leadersofanalytics.com/episode/the-future-of-analytics-leadership-with-john-thompson
Matt Brady | ZUMA: Data For Good | https://www.youtube.com/@zuma-dataforgood
Julia Schottenstein | The Analytics Engineering Podcast | https://roundup.getdbt.com/s/the-analytics-engineering-podcast
- | Data Unlocked | https://dataunlocked.buzzsprout.com/
Boris Jabes | The Sequel Show | https://www.thesequelshow.com/
- | Data Radicals | https://www.alation.com/podcast/
Nicola Askham | The Data Governance | https://www.nicolaaskham.com/podcast
Boaz Farkash | The Data Engineering Show | https://www.dataengineeringshow.com/
Bob Haffner | The Engineering Side of Data | https://podcasts.apple.com/us/podcast/the-engineering-side-of-data/id1566999533
Dan Linstedt | Data Vault Alliance | https://datavaultalliance.com/category/news/podcasts/
Dustin Schimek | Data Ideas | https://podcasts.apple.com/us/podcast/data-ideas/id1650322207
Alex Merced | The datanation | https://podcasts.apple.com/be/podcast/the-datanation-podcast-podcast-for-data-engineers/id1608638822
Thomas Bustos | Let's Talk AI | https://www.youtube.com/@lets-talk-ai
Jahanvee Narang | Decoding Data Analytics | https://www.youtube.com/@decodingdataanalytics/videos

    10-Updates

    Updates#

    What's new? Here you can find a list of all the updates with links to the sections

    • 2024-11-23
  • Prepared a GenAI RAG example project that you can run on your own computer without internet access. It uses Ollama with the Mistral model and Elasticsearch. I'm working on a way of creating embeddings from PDF files and inserting them into Elasticsearch for queries (see the sketch after this list) click here
    • 2024-11-23
      • Added an overview of AWS and Azure cloud certifications for Data Engineers. From beginners to experts click here
    • 2024-07-31
  • Added 10 platform architecture reaction videos I did to the "Best Practices" section. This way you get a better feeling of what companies are doing and which tools they use click here
    • 2024-07-17
  • Added 20 API interview questions and their answers click here
      • Added 10 Python interview questions and their answers click here
    • 2024-07-08
  • Added a large article about Snowflake and dbt for Data Engineers click here
  • Added a new section "Analytical Data Stores" to Advanced Skills with the Snowflake & dbt info.
      • Put SQL and NoSQL datastores into a new section "Transactional Data Stores"
    • 2024-03-20
      • Added roadmap for Software Engineers / Computer Scientists click here
      • Added many questions and answers from my interview on the Super Data Science Podcast (plus links to YouTube and the Podcast) click here
    • 2024-03-13
      • Added "How to become a Senior Data Engineer" live stream series as a blog post with images shown in the live streams and the links to the videos. click here
    • 2024-03-08
  • Included the Data Engineering skills matrix in the introduction with a link to the live stream. click here
    • 2024-03-01
      • Added updates section
      • Reworked the Hands-on courses section with 5 free courses / tutorials from Andreas on YouTube click here
    • 2024-02-28
      • Added Data Engineering Roadmap for Data Scientists: click here
    • 2024-02-25
      • Data Engineering Roadmap for Software Engineers: click here
    • 2024-02-20
      • Data Engineering Roadmap for Data Analysts: click here
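To give you a rough picture of how the GenAI RAG project from the 2024-11-23 update could hang together, here is a minimal sketch, not the actual project code: extract text from a PDF, create embeddings with the Mistral model through a local Ollama server, index them into Elasticsearch, and answer questions from the retrieved chunks. The pypdf, ollama and elasticsearch Python packages, the index name "pdf_chunks", the 4096 embedding dimension and the file name "example.pdf" are all assumptions for illustration.

```python
# Minimal local RAG sketch (illustrative only, not the project's code).
# Assumptions: `pip install pypdf ollama elasticsearch`, `ollama pull mistral`,
# Elasticsearch 8.x running on http://localhost:9200, index name "pdf_chunks",
# and 4096 as the embedding size of the Mistral model served by Ollama.

from pypdf import PdfReader
import ollama
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
INDEX = "pdf_chunks"  # assumed index name

def create_index() -> None:
    # dense_vector mapping so Elasticsearch can run kNN similarity search
    if not es.indices.exists(index=INDEX):
        es.indices.create(
            index=INDEX,
            mappings={
                "properties": {
                    "text": {"type": "text"},
                    "embedding": {
                        "type": "dense_vector",
                        "dims": 4096,  # must match the model's embedding size
                        "index": True,
                        "similarity": "cosine",
                    },
                }
            },
        )

def embed(text: str) -> list[float]:
    # Embeddings come from the local Ollama server, so no internet is needed.
    return ollama.embeddings(model="mistral", prompt=text)["embedding"]

def index_pdf(path: str) -> None:
    # One document per PDF page; a real project would chunk pages further.
    reader = PdfReader(path)
    for page in reader.pages:
        chunk = (page.extract_text() or "").strip()
        if chunk:
            es.index(index=INDEX, document={"text": chunk, "embedding": embed(chunk)})

def ask(question: str) -> str:
    # Retrieve the three most similar chunks and hand them to Mistral as context.
    hits = es.search(
        index=INDEX,
        knn={"field": "embedding", "query_vector": embed(question),
             "k": 3, "num_candidates": 50},
    )
    context = "\n\n".join(h["_source"]["text"] for h in hits["hits"]["hits"])
    reply = ollama.chat(model="mistral", messages=[{
        "role": "user",
        "content": f"Answer the question using only this context:\n{context}\n\nQuestion: {question}",
    }])
    return reply["message"]["content"]

if __name__ == "__main__":
    create_index()
    index_pdf("example.pdf")  # hypothetical file name
    print(ask("What is this document about?"))
```

The point of the setup is that everything runs locally: Ollama serves the Mistral model for both the embeddings and the answers, and Elasticsearch handles the vector retrieval, so no internet connection is needed once the model has been pulled.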

    THE DATA ENGINEERING COOKBOOK

    by ANDREAS KRETZ
