diff --git a/404.html b/404.html index 419baf6..720138b 100644 --- a/404.html +++ b/404.html @@ -7,14 +7,14 @@ Page Not Found | THE DATA ENGINEERING COOKBOOK - +

Page Not Found

We could not find what you were looking for.

Please contact the owner of the site that linked you to the original URL and let them know their link is broken.

- + \ No newline at end of file diff --git a/assets/js/a4b6b237.47726173.js b/assets/js/a4b6b237.394c5a42.js similarity index 98% rename from assets/js/a4b6b237.47726173.js rename to assets/js/a4b6b237.394c5a42.js index f62d3c0..115f615 100644 --- a/assets/js/a4b6b237.47726173.js +++ b/assets/js/a4b6b237.394c5a42.js @@ -1 +1 @@ -(window.webpackJsonp=window.webpackJsonp||[]).push([[8],{76:function(e,t,a){"use strict";a.r(t),a.d(t,"frontMatter",(function(){return c})),a.d(t,"metadata",(function(){return o})),a.d(t,"toc",(function(){return l})),a.d(t,"default",(function(){return b}));var n=a(3),r=a(7),i=(a(0),a(88)),c={},o={unversionedId:"10-Updates",id:"10-Updates",isDocsHomePage:!1,title:"10-Updates",description:"Updates",source:"@site/docs/10-Updates.md",slug:"/10-Updates",permalink:"/docs/10-Updates",version:"current",sidebar:"projectsSidebar",previous:{title:"09-BooksAndCourses",permalink:"/docs/09-BooksAndCourses"}},l=[],s={toc:l};function b(e){var t=e.components,a=Object(r.a)(e,["components"]);return Object(i.b)("wrapper",Object(n.a)({},s,a,{components:t,mdxType:"MDXLayout"}),Object(i.b)("h1",{id:"updates"},"Updates"),Object(i.b)("p",null,"What's new? Here you can find a list of all the updates with links to the sections"),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-11-23"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Prepared a GenAI RAG example project that you can run on your own computer without internet. It uses Ollama with Mistral model and ElasticSearch. Working on a way of creating embeddings from pdf files and inserting them into ElsaticSearch for queries ",Object(i.b)("a",{parentName:"li",href:"/docs/04-HandsOnCourse#genai-retrieval-augmented-generation-with-ollama-and-elasticsearch"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-11-23"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Added an overview of AWS and Azure cloud certifications for Data Engineers. From beginners to experts ",Object(i.b)("a",{parentName:"li",href:"/docs/09-BooksAndCourses#Certifications"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-07-31"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},'Added 10 platform architecture react videos I did to the "Best Practices" section. 
This way you get a better feeling of what companies are doing and which tools they use ',Object(i.b)("a",{parentName:"li",href:"/docs/06-BestPracticesCloud#best-practices"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-07-17"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Added 20 API interview questoins and their answers ",Object(i.b)("a",{parentName:"li",href:"/docs/08-InterviewQuestions#apis"},"click here")),Object(i.b)("li",{parentName:"ul"},"Added 10 Python interview questions and their answers ",Object(i.b)("a",{parentName:"li",href:"/docs/03-AdvancedSkills#python"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-07-08"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Added large article about Snowflake and dbt for Data Engineers ",Object(i.b)("a",{parentName:"li",href:"/docs/03-AdvancedSkills#analytical-data-stores"},"click here")),Object(i.b)("li",{parentName:"ul"},'Added new secton "Analytical Data Stores" to Advanced skills with the Snowflake & dbt infos.'),Object(i.b)("li",{parentName:"ul"},'Put SQL and NoSQL datastores into a new section "Transactional Data Stores"')))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-03-20"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Added roadmap for Software Engineers / Computer Scientists ",Object(i.b)("a",{parentName:"li",href:"/docs/01-Introduction#roadmap-for-software-engineers"},"click here")),Object(i.b)("li",{parentName:"ul"},"Added many questions and answers from my interview on the Super Data Science Podcast (plus links to YouTube and the Podcast) ",Object(i.b)("a",{parentName:"li",href:"/docs/01-Introduction#Interview-with-Andreas-on-the-Super-Data-Science-Podcast"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-03-13"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},'Added "How to become a Senior Data Engineer" live stream series as a blog post with images shown in the live streams and the links to the videos. ',Object(i.b)("a",{parentName:"li",href:"/docs/01-Introduction#how-to-become-a-senior-data-engineer"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-03-08"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Included Data Engineering skills matrix into the introduction with link to the live stream. 
",Object(i.b)("a",{parentName:"li",href:"/docs/01-Introduction#data-engineers-skills-matrix"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-03-01"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Added updates section"),Object(i.b)("li",{parentName:"ul"},"Reworked the Hands-on courses section with 5 free courses / tutorials from Andreas on YouTube ",Object(i.b)("a",{parentName:"li",href:"/docs/04-HandsOnCourse"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-02-28"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Added Data Engineering Roadmap for Data Scientists: ",Object(i.b)("a",{parentName:"li",href:"/docs/01-Introduction#roadmap-for-data-scientists"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-02-25"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Data Engineering Roadmap for Software Engineers: ",Object(i.b)("a",{parentName:"li",href:"/docs/01-Introduction#roadmap-for-software-engineers"},"click here"))))),Object(i.b)("ul",null,Object(i.b)("li",{parentName:"ul"},Object(i.b)("strong",{parentName:"li"},"2024-02-20"),Object(i.b)("ul",{parentName:"li"},Object(i.b)("li",{parentName:"ul"},"Data Engineering Roadmap for Data Analysts: ",Object(i.b)("a",{parentName:"li",href:"/docs/01-Introduction#roadmap-for-data-analysts"},"click here"))))))}b.isMDXComponent=!0},88:function(e,t,a){"use strict";a.d(t,"a",(function(){return u})),a.d(t,"b",(function(){return m}));var n=a(0),r=a.n(n);function i(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function c(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,n)}return a}function o(e){for(var t=1;t=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var s=r.a.createContext({}),b=function(e){var t=r.a.useContext(s),a=t;return e&&(a="function"==typeof e?e(t):o(o({},t),e)),a},u=function(e){var t=b(e.components);return r.a.createElement(s.Provider,{value:t},e.children)},d={inlineCode:"code",wrapper:function(e){var t=e.children;return r.a.createElement(r.a.Fragment,{},t)}},p=r.a.forwardRef((function(e,t){var a=e.components,n=e.mdxType,i=e.originalType,c=e.parentName,s=l(e,["components","mdxType","originalType","parentName"]),u=b(a),p=n,m=u["".concat(c,".").concat(p)]||u[p]||d[p]||i;return a?r.a.createElement(m,o(o({ref:t},s),{},{components:a})):r.a.createElement(m,o({ref:t},s))}));function m(e,t){var a=arguments,n=t&&t.mdxType;if("string"==typeof e||n){var i=a.length,c=new Array(i);c[0]=p;var o={};for(var l in t)hasOwnProperty.call(t,l)&&(o[l]=t[l]);o.originalType=e,o.mdxType="string"==typeof e?e:n,c[1]=o;for(var s=2;s=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var s=r.a.createContext({}),b=function(e){var t=r.a.useContext(s),a=t;return e&&(a="function"==typeof e?e(t):o(o({},t),e)),a},u=function(e){var t=b(e.components);return 
r.a.createElement(s.Provider,{value:t},e.children)},d={inlineCode:"code",wrapper:function(e){var t=e.children;return r.a.createElement(r.a.Fragment,{},t)}},p=r.a.forwardRef((function(e,t){var a=e.components,n=e.mdxType,i=e.originalType,c=e.parentName,s=l(e,["components","mdxType","originalType","parentName"]),u=b(a),p=n,m=u["".concat(c,".").concat(p)]||u[p]||d[p]||i;return a?r.a.createElement(m,o(o({ref:t},s),{},{components:a})):r.a.createElement(m,o({ref:t},s))}));function m(e,t){var a=arguments,n=t&&t.mdxType;if("string"==typeof e||n){var i=a.length,c=new Array(i);c[0]=p;var o={};for(var l in t)hasOwnProperty.call(t,l)&&(o[l]=t[l]);o.originalType=e,o.mdxType="string"==typeof e?e:n,c[1]=o;for(var s=2;s sends you to the browser")),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"For Windows Users"),"\nConfigure WSL2 to use max only 4GB of ram:"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},'wsl --shutdown\nnotepad "$env:USERPROFILE/.wslconfig"\n')),Object(o.b)("p",null,".wslconfig file:"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"[wsl2]\nmemory=4GB # Limits VM memory in WSL 2 up to 4GB\n")),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Modify the Linux kernel map count in WSL"),"\nDo this before the start because Elasticsearch requires a higher value to work\n",Object(o.b)("inlineCode",{parentName:"p"},"sudo sysctl -w vm.max_map_count=262144")),Object(o.b)("ol",{start:4},Object(o.b)("li",{parentName:"ol"},"go to the Elasticsearch-RAG folder and do ",Object(o.b)("inlineCode",{parentName:"li"},"docker compose up")),Object(o.b)("li",{parentName:"ol"},"make sure you have Elasticsearch 8.11 or later (we use 8.16 here in this project) if you want to use your own Elasticsearch image"),Object(o.b)("li",{parentName:"ol"},"if you get this error on a mac then just open the console in the docker app: ",Object(o.b)("em",{parentName:"li"},"error getting credentials - err: exec: docker-credential-desktop: executable file not found in $PATH, out:")),Object(o.b)("li",{parentName:"ol"},"Install xcode command line tools: ",Object(o.b)("inlineCode",{parentName:"li"},"xcode-select --install")),Object(o.b)("li",{parentName:"ol"},"make sure you're at python 3.8.1 or larger -> installed 3.13.0 from ",Object(o.b)("a",{parentName:"li",href:"https://www.python.org/downloads/"},"https://www.python.org/downloads/"))),Object(o.b)("h3",{id:"setup-the-virtual-python-environment"},"Setup the virtual Python environment"),Object(o.b)("h4",{id:"preparation-on-a-mac"},"preparation on a Mac"),Object(o.b)("h5",{id:"install-brew"},"install brew"),Object(o.b)("p",null,'which brew\n/bin/bash -c "$(curl -fsSL ',Object(o.b)("a",{parentName:"p",href:"https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)%22"},'https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"'),'\nexport PATH="/opt/homebrew/bin:$PATH"\nbrew --version\nbrew install pyenv\nbrew install pyenv-virtualenv'),Object(o.b)("h5",{id:"install-pyenv"},"install pyenv"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"brew install pyenv\nbrew install pyenv-virtualenv\n")),Object(o.b)("p",null,"Modify the path so that pyenv is in the path variable\n",Object(o.b)("inlineCode",{parentName:"p"},"nano ~/.zshrc")),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},'export PYENV_ROOT="$HOME/.pyenv"\nexport PATH="$PYENV_ROOT/bin:$PATH"\neval "$(pyenv init --path)"\neval "$(pyenv init -)"\neval "$(pyenv virtualenv-init -)"\n')),Object(o.b)("p",null,"install dependencies for 
building python versions\n",Object(o.b)("inlineCode",{parentName:"p"},"brew install openssl readline sqlite3 xz zlib")),Object(o.b)("p",null,"Reload to apply changes\n",Object(o.b)("inlineCode",{parentName:"p"},"source ~/.zshrc")),Object(o.b)("p",null,"install python"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"pyenv install 3.11.6\npyenv version\n")),Object(o.b)("p",null,"Set Python version system wide\n",Object(o.b)("inlineCode",{parentName:"p"},"pyenv global 3.11.6")),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"pyenv virtualenv \npyenv activate \npyenv virtualenv-delete \n")),Object(o.b)("h4",{id:"windows-without-pyenv"},"Windows without pyenv"),Object(o.b)("p",null,"setup virtual python environment - go to the Elasticsearch-RAG folder and do\n",Object(o.b)("inlineCode",{parentName:"p"},"python3 -m venv .elkrag"),"\nenable the environment\n",Object(o.b)("inlineCode",{parentName:"p"},"source .elkrag/bin/activate")),Object(o.b)("h3",{id:"install-required-libraries-do-one-at-a-time-so-you-see-errors"},"Install required libraries (do one at a time so you see errors):"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"pip install llama-index (optional python3 -m pip install package name)\npip install llama-index-embeddings-ollama\npip install llama-index-llms-ollama\npip install llama-index-vector-stores-elasticsearch\npip install python-dotenv\n")),Object(o.b)("h3",{id:"write-the-data-to-elasticsearch"},"Write the data to Elasticsearch"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},"create / copy in the index.py file"),Object(o.b)("li",{parentName:"ol"},"download the conversations.json file from the folder code examples/GenAI-RAG"),Object(o.b)("li",{parentName:"ol"},"if you get an error with the execution then check if pedantic version is <2.0 ",Object(o.b)("inlineCode",{parentName:"li"},"pip show pydantic")," if not do this: ",Object(o.b)("inlineCode",{parentName:"li"},'pip install "pydantic<2.0')),Object(o.b)("li",{parentName:"ol"},"run the program index.py: ",Object(o.b)("a",{parentName:"li",href:"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/index.py"},"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/index.py"))),Object(o.b)("h3",{id:"check-the-data-in-elasticsearch"},"Check the data in Elasticsearch"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},"go to kibana http://localhost:5601/app/management/data/index_management/indices and see the new index called calls"),Object(o.b)("li",{parentName:"ol"},"go to dev tools and try out this query ",Object(o.b)("inlineCode",{parentName:"li"},"GET calls/_search?size=1 http://localhost:5601/app/dev_tools#/console/shell"))),Object(o.b)("h3",{id:"query-data-from-elasticsearch-and-create-an-output-with-mistral"},"Query data from elasticsearch and create an output with Mistral"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},"if everything is good then run the query.py file ",Object(o.b)("a",{parentName:"li",href:"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/query.py"},"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/query.py")),Object(o.b)("li",{parentName:"ol"},"try a few queries :)")),Object(o.b)("h3",{id:"install-libraries-to-extract-text-from-pdfs"},"Install libraries to extract text from pdfs"),Object(o.b)("h3",{id:"extract-data-from-cv-and-put-it-into-elasticsearch"},"Extract data from CV and put it into Elasticsearch"),Object(o.b)("p",null,"I created a CV 
with ChatGPT ",Object(o.b)("a",{parentName:"p",href:"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/Liam_McGivney_CV.pdf"},"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/Liam_McGivney_CV.pdf")),Object(o.b)("p",null,"Install the library to extract text from the pdf\n",Object(o.b)("inlineCode",{parentName:"p"},"pip install PyMuPDF"),"\nI had to Shift+Command+p then python clear workspace cache and reload window. Then it saw it :/"),Object(o.b)("p",null,"The file cvpipeline.py has the python code for the indexing. It's not working right now though!\n",Object(o.b)("a",{parentName:"p",href:"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/cvpipeline.py"},"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/cvpipeline.py")),Object(o.b)("p",null,"I'll keep developing this and update it once it's working."),Object(o.b)("h2",{id:"free-data-engineering-course-with-aws-tdengine-docker-and-grafana"},"Free Data Engineering Course with AWS TDengine Docker and Grafana"),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Free hands-on course:")," ",Object(o.b)("a",{parentName:"p",href:"https://youtu.be/eoj-CnrR9jA"},"Watch on YouTube")),Object(o.b)("p",null,"In this detailed tutorial video, Andreas guides viewers through creating an end-to-end data pipeline using time series data. The project focuses on fetching weather data from a Weather API, processing it on AWS, storing it in TDengine (a time series database), and visualizing the data with Grafana. Here's a concise summary of what the video covers:"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Introduction and Setup:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"The project is introduced along with a GitHub repository containing all necessary resources and a step-by-step guide."),Object(o.b)("li",{parentName:"ul"},"The pipeline architecture includes an IoT weather station, a Weather API, AWS for processing, TDengine for data storage, and Grafana for visualization.")),Object(o.b)("ol",{start:2},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Project Components:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Weather API:")," Utilizes weatherapi.com to fetch weather data."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"AWS Lambda:")," Processes the data fetched from the Weather API."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"TDengine:")," Serves as the time series database to store processed data. 
It's highlighted for its performance and simplicity, especially for time series data."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Grafana:")," Used for creating dashboards to visualize the time series data.")),Object(o.b)("ol",{start:3},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Development and Deployment:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"The local development environment setup includes Python, Docker, and VS Code."),Object(o.b)("li",{parentName:"ul"},"The tutorial covers the creation of a Docker image for the project and deploying it to AWS Elastic Container Registry (ECR)."),Object(o.b)("li",{parentName:"ul"},"AWS Lambda is then configured to use the Docker image from ECR."),Object(o.b)("li",{parentName:"ul"},"AWS EventBridge is used to schedule the Lambda function to run at specified intervals.")),Object(o.b)("ol",{start:4},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Time Series Data:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"The importance of time series data and the benefits of using a time series database like TDengine over traditional relational databases are discussed."),Object(o.b)("li",{parentName:"ul"},"TDengine's features such as speed, scaling, data retention, and built-in functions for time series data are highlighted.")),Object(o.b)("ol",{start:5},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Building the Pipeline:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"Detailed instructions are provided for setting up each component of the pipeline:",Object(o.b)("ul",{parentName:"li"},Object(o.b)("li",{parentName:"ul"},"Fetching weather data from the Weather API."),Object(o.b)("li",{parentName:"ul"},"Processing and sending the data to TDengine using an AWS Lambda function."),Object(o.b)("li",{parentName:"ul"},"Visualizing the data with Grafana."))),Object(o.b)("li",{parentName:"ul"},"Each step includes code snippets and configurations needed to implement the pipeline.")),Object(o.b)("ol",{start:6},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Conclusion:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"The video concludes with a demonstration of the completed pipeline, showing weather data visualizations in Grafana."),Object(o.b)("li",{parentName:"ul"},"Viewers are encouraged to replicate the project using the resources provided in the GitHub repository linked in the video description.")),Object(o.b)("p",null,"This video provides a comprehensive guide to building a data pipeline with a focus on time series data, demonstrating the integration of various technologies and platforms to achieve an end-to-end solution."),Object(o.b)("h2",{id:"monitor-your-data-in-dbt-and-detect-quality-issues-with-elementary"},"Monitor your data in dbt and detect quality issues with Elementary"),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Free hands-on tutorial:")," ",Object(o.b)("a",{parentName:"p",href:"https://youtu.be/6fnU91Q2gq0"},"Watch on YouTube")),Object(o.b)("p",null,"In this comprehensive tutorial, Andreas delves into the integration of dbt (data build tool) with Elementary to enhance data monitoring and quality detection within Snowflake databases. The tutorial is structured to guide viewers through a hands-on experience, starting with an introduction to a sample project setup and the common challenges faced in monitoring dbt jobs. 
It then transitions into how Elementary can be utilized to address these challenges effectively."),Object(o.b)("p",null,"Key learning points and tutorial structure include:"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Introduction to the Sample Project:")," Andreas showcases a project setup involving Snowflake as the data warehouse, dbt for data modeling and testing, and a visualization tool for data analysis. This setup serves as the basis for the tutorial."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Challenges in Monitoring dbt Jobs:")," Common issues in monitoring dbt jobs are discussed, highlighting the limitations of the dbt interface in providing comprehensive monitoring capabilities."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Introduction to Elementary:")," Elementary is introduced as a dbt-native data observability tool designed to enhance the monitoring and analysis of dbt jobs. It offers both open-source and cloud versions, with the tutorial focusing on the cloud version."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Setup Requirements:")," The tutorial covers the necessary setup on both the Snowflake and dbt sides, including schema creation, user and role configuration in Snowflake, and modifications to the dbt project for integrating with Elementary."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Elementary's User Interface and Features:")," A thorough walkthrough of Elementary's interface is provided, showcasing its dashboard, test results, model runs, data catalog, and data lineage features. The tool's ability to automatically run additional tests, like anomaly detection and schema change detection, is also highlighted."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Advantages of Using Elementary:")," The presenter outlines several benefits of using Elementary, such as easy implementation, native test integration, clean and straightforward UI, and enhanced privacy due to data being stored within the user's data warehouse."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Potential Drawbacks:")," Some potential drawbacks are discussed, including the additional load on dbt job execution due to more models being run and limitations in dashboard customization."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Summary and Verdict:")," The tutorial concludes with a summary of the key features and benefits of using Elementary with dbt, emphasizing its value in improving data quality monitoring and detection.")),Object(o.b)("p",null,"Overall, viewers are guided through setting up and utilizing Elementary for dbt data monitoring, gaining insights into its capabilities, setup process, and the practical benefits it offers for data quality assurance."),Object(o.b)("h2",{id:"solving-engineers-4-biggest-airflow-problems"},"Solving Engineers 4 Biggest Airflow Problems"),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Free hands-on tutorial:")," ",Object(o.b)("a",{parentName:"p",href:"https://youtu.be/b9bMNEh8bes"},"Watch on YouTube")),Object(o.b)("p",null,"In this informative video, Andreas discusses the four major challenges engineers face when working with Apache Airflow and introduces Astronomer, a managed Airflow service that addresses these issues effectively. 
Astronomer is highlighted as a solution that simplifies Airflow deployment and management, making it easier for engineers to develop, deploy, and monitor their data pipelines. Here's a summary of the key points discussed for each challenge and how Astronomer provides solutions:"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},"Managing Airflow Deployments:")),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Challenge:")," Setting up and maintaining Airflow deployments is complex and time-consuming, involving configuring cloud instances, managing resources, scaling, and updating the Airflow system."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Solution with Astronomer:")," Offers a straightforward deployment process where users can easily configure their deployments, choose cloud providers (GCP, AWS, Azure), and set up scaling with just a few clicks. Astronomer handles the complexity, making it easier to manage production and quality environments.")),Object(o.b)("ol",{start:2},Object(o.b)("li",{parentName:"ol"},"Development Environment and Deployment:")),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Challenge:")," Local installation of Airflow is complicated due to its dependency on multiple Docker containers and the need for extensive configuration."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Solution with Astronomer:")," Provides a CLI tool for setting up a local development environment with a single command, simplifying the process of developing, testing, and deploying pipelines. The Astronomer CLI also helps in initializing project templates and deploying Dags to the cloud effortlessly.")),Object(o.b)("ol",{start:3},Object(o.b)("li",{parentName:"ol"},"Source Code Management and CI/CD Pipelines:")),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Challenge:")," Collaborative development and continuous integration/deployment (CI/CD) are essential but challenging to implement effectively with Airflow alone."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Solution with Astronomer:")," Facilitates easy integration with GitHub for source code management and GitHub Actions for CI/CD. This allows automatic testing and deployment of pipeline code, ensuring a smooth workflow for teams working on pipeline development.")),Object(o.b)("ol",{start:4},Object(o.b)("li",{parentName:"ol"},"Observing Pipelines and Alarms:")),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Challenge:")," Monitoring data pipelines and getting timely alerts when issues occur is crucial but often difficult to achieve."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Solution with Astronomer:")," The Astronomer platform provides a user-friendly interface for monitoring pipeline status and performance. It also offers customizable alerts for failures or prolonged task durations, with notifications via email, PagerDuty, or Slack, ensuring immediate awareness and response to issues.")),Object(o.b)("p",null,"Overall, the video shows Astronomer as a powerful and user-friendly platform that addresses the common challenges of using Airflow, from deployment and development to collaboration, CI/CD, and monitoring. 
It suggests that Astronomer can significantly improve the experience of engineers working with Airflow, making it easier to manage, develop, and monitor data pipelines."),Object(o.b)("h2",{id:"the-best-alternative-to-airlfow-mageai"},"The best alternative to Airlfow? Mage.ai"),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Free hands-on tutorial:")," ",Object(o.b)("a",{parentName:"p",href:"https://youtu.be/3gXsFEC3aYA"},"Watch on YouTube")),Object(o.b)("p",null,"In this insightful video, Andreas introduces Mage, a promising alternative to Apache Airflow, focusing on its simplicity, user-friendliness, and scalability. The video provides a comprehensive walkthrough of Mage, highlighting its key features and advantages over Airflow. Here's a breakdown of what viewers can learn and expect from the video:"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Deployment Ease:")," Mage offers a stark contrast to Airflow's complex setup process. It simplifies deployment to a single Docker image, making it straightforward to install and start on any machine, whether it's local or cloud-based on AWS, GCP, or Azure. This simplicity extends to scaling, which Mage handles horizontally, particularly beneficial in Kubernetes environments where performance scales with the number of pipelines."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"User Interface (UI):")," Mage shines with its UI, presenting a dark mode interface that's not only visually appealing but also simplifies navigation and pipeline management. The UI facilitates easy access to pipelines, scheduling, and monitoring of pipeline runs, offering a more intuitive experience compared to Airflow."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Pipeline Creation and Modification:")," Mage streamlines the creation of ETL pipelines, allowing users to easily add data loaders, transformers, and exporters through its UI. It supports direct interaction with APIs for data loading and provides a visual representation of the data flow, enhancing the overall pipeline design experience."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Data Visualization and Exploration:")," Beyond simple pipeline creation, Mage enables in-depth data exploration within the UI. Users can generate various charts, such as histograms and bar charts, to analyze the data directly, a feature that greatly enhances the tool's utility."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Testing and Scheduling:")," Testing pipelines in Mage is straightforward, allowing for quick integration of tests to ensure data quality and pipeline reliability. Scheduling is also versatile, supporting standard time-based triggers, event-based triggers for real-time data ingestion, and API calls for on-demand pipeline execution."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Support for Streaming and ELT Processes:")," Mage is not limited to ETL workflows but also supports streaming and ELT processes. It integrates seamlessly with DBT models for in-warehouse transformations and Spark for big data processing, showcasing its versatility and scalability."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Conclusion and Call to Action:")," Andreas concludes by praising the direction in which the industry is moving, with tools like Mage simplifying data engineering processes. 
He encourages viewers to try Mage and engage with the content by liking, subscribing, and commenting on their current tools and the potential impact of Mage.")),Object(o.b)("p",null,"Overall, the video shows Mage as a highly user-friendly, scalable, and versatile tool for data pipeline creation and management, offering a compelling alternative to traditional tools like Airflow."))}d.isMDXComponent=!0},88:function(e,t,a){"use strict";a.d(t,"a",(function(){return b})),a.d(t,"b",(function(){return m}));var n=a(0),i=a.n(n);function o(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function r(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,n)}return a}function l(e){for(var t=1;t=0||(i[a]=e[a]);return i}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(i[a]=e[a])}return i}var c=i.a.createContext({}),d=function(e){var t=i.a.useContext(c),a=t;return e&&(a="function"==typeof e?e(t):l(l({},t),e)),a},b=function(e){var t=d(e.components);return i.a.createElement(c.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return i.a.createElement(i.a.Fragment,{},t)}},u=i.a.forwardRef((function(e,t){var a=e.components,n=e.mdxType,o=e.originalType,r=e.parentName,c=s(e,["components","mdxType","originalType","parentName"]),b=d(a),u=n,m=b["".concat(r,".").concat(u)]||b[u]||p[u]||o;return a?i.a.createElement(m,l(l({ref:t},c),{},{components:a})):i.a.createElement(m,l({ref:t},c))}));function m(e,t){var a=arguments,n=t&&t.mdxType;if("string"==typeof e||n){var o=a.length,r=new Array(o);r[0]=u;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:n,r[1]=l;for(var c=2;c sends you to the browser")),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"For Windows Users"),"\nConfigure WSL2 to use max only 4GB of ram:"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},'wsl --shutdown\nnotepad "$env:USERPROFILE/.wslconfig"\n')),Object(o.b)("p",null,".wslconfig file:"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"[wsl2]\nmemory=4GB # Limits VM memory in WSL 2 up to 4GB\n")),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Modify the Linux kernel map count in WSL"),"\nDo this before the start because Elasticsearch requires a higher value to work\n",Object(o.b)("inlineCode",{parentName:"p"},"sudo sysctl -w vm.max_map_count=262144")),Object(o.b)("ol",{start:4},Object(o.b)("li",{parentName:"ol"},"go to the Elasticsearch-RAG folder and do ",Object(o.b)("inlineCode",{parentName:"li"},"docker compose up")),Object(o.b)("li",{parentName:"ol"},"make sure you have Elasticsearch 8.11 or later (we use 8.16 here in this project) if you want to use your own Elasticsearch image"),Object(o.b)("li",{parentName:"ol"},"if you get this error on a mac then just open the console in the docker app: ",Object(o.b)("em",{parentName:"li"},"error getting credentials - err: exec: docker-credential-desktop: executable file not found in $PATH, out:")),Object(o.b)("li",{parentName:"ol"},"Install xcode command line tools: ",Object(o.b)("inlineCode",{parentName:"li"},"xcode-select --install")),Object(o.b)("li",{parentName:"ol"},"make sure you're at python 3.8.1 or larger -> installed 3.13.0 from 
",Object(o.b)("a",{parentName:"li",href:"https://www.python.org/downloads/"},"https://www.python.org/downloads/"))),Object(o.b)("h3",{id:"setup-the-virtual-python-environment"},"Setup the virtual Python environment"),Object(o.b)("h4",{id:"preparation-on-a-mac"},"preparation on a Mac"),Object(o.b)("h5",{id:"install-brew"},"install brew"),Object(o.b)("p",null,'which brew\n/bin/bash -c "$(curl -fsSL ',Object(o.b)("a",{parentName:"p",href:"https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)%22"},'https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"'),'\nexport PATH="/opt/homebrew/bin:$PATH"\nbrew --version\nbrew install pyenv\nbrew install pyenv-virtualenv'),Object(o.b)("h5",{id:"install-pyenv"},"install pyenv"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"brew install pyenv\nbrew install pyenv-virtualenv\n")),Object(o.b)("p",null,"Modify the path so that pyenv is in the path variable\n",Object(o.b)("inlineCode",{parentName:"p"},"nano ~/.zshrc")),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},'export PYENV_ROOT="$HOME/.pyenv"\nexport PATH="$PYENV_ROOT/bin:$PATH"\neval "$(pyenv init --path)"\neval "$(pyenv init -)"\neval "$(pyenv virtualenv-init -)"\n')),Object(o.b)("p",null,"install dependencies for building python versions\n",Object(o.b)("inlineCode",{parentName:"p"},"brew install openssl readline sqlite3 xz zlib")),Object(o.b)("p",null,"Reload to apply changes\n",Object(o.b)("inlineCode",{parentName:"p"},"source ~/.zshrc")),Object(o.b)("p",null,"install python"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"pyenv install 3.11.6\npyenv version\n")),Object(o.b)("p",null,"Set Python version system wide\n",Object(o.b)("inlineCode",{parentName:"p"},"pyenv global 3.11.6")),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"pyenv virtualenv \npyenv activate \npyenv virtualenv-delete \n")),Object(o.b)("h4",{id:"windows-without-pyenv"},"Windows without pyenv"),Object(o.b)("p",null,"setup virtual python environment - go to the Elasticsearch-RAG folder and do\n",Object(o.b)("inlineCode",{parentName:"p"},"python3 -m venv .elkrag"),"\nenable the environment\n",Object(o.b)("inlineCode",{parentName:"p"},"source .elkrag/bin/activate")),Object(o.b)("h3",{id:"install-required-libraries-do-one-at-a-time-so-you-see-errors"},"Install required libraries (do one at a time so you see errors):"),Object(o.b)("pre",null,Object(o.b)("code",{parentName:"pre"},"pip install llama-index (optional python3 -m pip install package name)\npip install llama-index-embeddings-ollama\npip install llama-index-llms-ollama\npip install llama-index-vector-stores-elasticsearch\npip install python-dotenv\n")),Object(o.b)("h3",{id:"write-the-data-to-elasticsearch"},"Write the data to Elasticsearch"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},"create / copy in the index.py file"),Object(o.b)("li",{parentName:"ol"},"download the conversations.json file from the folder code examples/GenAI-RAG"),Object(o.b)("li",{parentName:"ol"},"if you get an error with the execution then check if pedantic version is <2.0 ",Object(o.b)("inlineCode",{parentName:"li"},"pip show pydantic")," if not do this: ",Object(o.b)("inlineCode",{parentName:"li"},'pip install "pydantic<2.0')),Object(o.b)("li",{parentName:"ol"},"run the program index.py: 
",Object(o.b)("a",{parentName:"li",href:"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/index.py"},"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/index.py"))),Object(o.b)("h3",{id:"check-the-data-in-elasticsearch"},"Check the data in Elasticsearch"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},"go to kibana http://localhost:5601/app/management/data/index_management/indices and see the new index called calls"),Object(o.b)("li",{parentName:"ol"},"go to dev tools and try out this query ",Object(o.b)("inlineCode",{parentName:"li"},"GET calls/_search?size=1 http://localhost:5601/app/dev_tools#/console/shell"))),Object(o.b)("h3",{id:"query-data-from-elasticsearch-and-create-an-output-with-mistral"},"Query data from elasticsearch and create an output with Mistral"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},"if everything is good then run the query.py file ",Object(o.b)("a",{parentName:"li",href:"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/query.py"},"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/query.py")),Object(o.b)("li",{parentName:"ol"},"try a few queries :)")),Object(o.b)("h3",{id:"install-libraries-to-extract-text-from-pdfs"},"Install libraries to extract text from pdfs"),Object(o.b)("h3",{id:"extract-data-from-cv-and-put-it-into-elasticsearch"},"Extract data from CV and put it into Elasticsearch"),Object(o.b)("p",null,"I created a CV with ChatGPT ",Object(o.b)("a",{parentName:"p",href:"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/Liam_McGivney_CV.pdf"},"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/Liam_McGivney_CV.pdf")),Object(o.b)("p",null,"Install the library to extract text from the pdf\n",Object(o.b)("inlineCode",{parentName:"p"},"pip install PyMuPDF"),"\nI had to Shift+Command+p then python clear workspace cache and reload window. Then it saw it :/"),Object(o.b)("p",null,"The file cvpipeline.py has the python code for the indexing. It's not working right now though!\n",Object(o.b)("a",{parentName:"p",href:"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/cvpipeline.py"},"https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/cvpipeline.py")),Object(o.b)("p",null,"I'll keep developing this and update it once it's working."),Object(o.b)("h2",{id:"free-data-engineering-course-with-aws-tdengine-docker-and-grafana"},"Free Data Engineering Course with AWS TDengine Docker and Grafana"),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Free hands-on course:")," ",Object(o.b)("a",{parentName:"p",href:"https://youtu.be/eoj-CnrR9jA"},"Watch on YouTube")),Object(o.b)("p",null,"In this detailed tutorial video, Andreas guides viewers through creating an end-to-end data pipeline using time series data. The project focuses on fetching weather data from a Weather API, processing it on AWS, storing it in TDengine (a time series database), and visualizing the data with Grafana. 
Here's a concise summary of what the video covers:"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Introduction and Setup:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"The project is introduced along with a GitHub repository containing all necessary resources and a step-by-step guide."),Object(o.b)("li",{parentName:"ul"},"The pipeline architecture includes an IoT weather station, a Weather API, AWS for processing, TDengine for data storage, and Grafana for visualization.")),Object(o.b)("ol",{start:2},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Project Components:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Weather API:")," Utilizes weatherapi.com to fetch weather data."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"AWS Lambda:")," Processes the data fetched from the Weather API."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"TDengine:")," Serves as the time series database to store processed data. It's highlighted for its performance and simplicity, especially for time series data."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Grafana:")," Used for creating dashboards to visualize the time series data.")),Object(o.b)("ol",{start:3},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Development and Deployment:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"The local development environment setup includes Python, Docker, and VS Code."),Object(o.b)("li",{parentName:"ul"},"The tutorial covers the creation of a Docker image for the project and deploying it to AWS Elastic Container Registry (ECR)."),Object(o.b)("li",{parentName:"ul"},"AWS Lambda is then configured to use the Docker image from ECR."),Object(o.b)("li",{parentName:"ul"},"AWS EventBridge is used to schedule the Lambda function to run at specified intervals.")),Object(o.b)("ol",{start:4},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Time Series Data:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"The importance of time series data and the benefits of using a time series database like TDengine over traditional relational databases are discussed."),Object(o.b)("li",{parentName:"ul"},"TDengine's features such as speed, scaling, data retention, and built-in functions for time series data are highlighted.")),Object(o.b)("ol",{start:5},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Building the Pipeline:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"Detailed instructions are provided for setting up each component of the pipeline:",Object(o.b)("ul",{parentName:"li"},Object(o.b)("li",{parentName:"ul"},"Fetching weather data from the Weather API."),Object(o.b)("li",{parentName:"ul"},"Processing and sending the data to TDengine using an AWS Lambda function."),Object(o.b)("li",{parentName:"ul"},"Visualizing the data with Grafana."))),Object(o.b)("li",{parentName:"ul"},"Each step includes code snippets and configurations needed to implement the pipeline.")),Object(o.b)("ol",{start:6},Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Conclusion:"))),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},"The video concludes with a demonstration of the completed pipeline, showing weather data visualizations in 
Grafana."),Object(o.b)("li",{parentName:"ul"},"Viewers are encouraged to replicate the project using the resources provided in the GitHub repository linked in the video description.")),Object(o.b)("p",null,"This video provides a comprehensive guide to building a data pipeline with a focus on time series data, demonstrating the integration of various technologies and platforms to achieve an end-to-end solution."),Object(o.b)("h2",{id:"monitor-your-data-in-dbt-and-detect-quality-issues-with-elementary"},"Monitor your data in dbt and detect quality issues with Elementary"),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Free hands-on tutorial:")," ",Object(o.b)("a",{parentName:"p",href:"https://youtu.be/6fnU91Q2gq0"},"Watch on YouTube")),Object(o.b)("p",null,"In this comprehensive tutorial, Andreas delves into the integration of dbt (data build tool) with Elementary to enhance data monitoring and quality detection within Snowflake databases. The tutorial is structured to guide viewers through a hands-on experience, starting with an introduction to a sample project setup and the common challenges faced in monitoring dbt jobs. It then transitions into how Elementary can be utilized to address these challenges effectively."),Object(o.b)("p",null,"Key learning points and tutorial structure include:"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Introduction to the Sample Project:")," Andreas showcases a project setup involving Snowflake as the data warehouse, dbt for data modeling and testing, and a visualization tool for data analysis. This setup serves as the basis for the tutorial."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Challenges in Monitoring dbt Jobs:")," Common issues in monitoring dbt jobs are discussed, highlighting the limitations of the dbt interface in providing comprehensive monitoring capabilities."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Introduction to Elementary:")," Elementary is introduced as a dbt-native data observability tool designed to enhance the monitoring and analysis of dbt jobs. It offers both open-source and cloud versions, with the tutorial focusing on the cloud version."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Setup Requirements:")," The tutorial covers the necessary setup on both the Snowflake and dbt sides, including schema creation, user and role configuration in Snowflake, and modifications to the dbt project for integrating with Elementary."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Elementary's User Interface and Features:")," A thorough walkthrough of Elementary's interface is provided, showcasing its dashboard, test results, model runs, data catalog, and data lineage features. 
The tool's ability to automatically run additional tests, like anomaly detection and schema change detection, is also highlighted."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Advantages of Using Elementary:")," The presenter outlines several benefits of using Elementary, such as easy implementation, native test integration, clean and straightforward UI, and enhanced privacy due to data being stored within the user's data warehouse."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Potential Drawbacks:")," Some potential drawbacks are discussed, including the additional load on dbt job execution due to more models being run and limitations in dashboard customization."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Summary and Verdict:")," The tutorial concludes with a summary of the key features and benefits of using Elementary with dbt, emphasizing its value in improving data quality monitoring and detection.")),Object(o.b)("p",null,"Overall, viewers are guided through setting up and utilizing Elementary for dbt data monitoring, gaining insights into its capabilities, setup process, and the practical benefits it offers for data quality assurance."),Object(o.b)("h2",{id:"solving-engineers-4-biggest-airflow-problems"},"Solving Engineers 4 Biggest Airflow Problems"),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Free hands-on tutorial:")," ",Object(o.b)("a",{parentName:"p",href:"https://youtu.be/b9bMNEh8bes"},"Watch on YouTube")),Object(o.b)("p",null,"In this informative video, Andreas discusses the four major challenges engineers face when working with Apache Airflow and introduces Astronomer, a managed Airflow service that addresses these issues effectively. Astronomer is highlighted as a solution that simplifies Airflow deployment and management, making it easier for engineers to develop, deploy, and monitor their data pipelines. Here's a summary of the key points discussed for each challenge and how Astronomer provides solutions:"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},"Managing Airflow Deployments:")),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Challenge:")," Setting up and maintaining Airflow deployments is complex and time-consuming, involving configuring cloud instances, managing resources, scaling, and updating the Airflow system."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Solution with Astronomer:")," Offers a straightforward deployment process where users can easily configure their deployments, choose cloud providers (GCP, AWS, Azure), and set up scaling with just a few clicks. Astronomer handles the complexity, making it easier to manage production and quality environments.")),Object(o.b)("ol",{start:2},Object(o.b)("li",{parentName:"ol"},"Development Environment and Deployment:")),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Challenge:")," Local installation of Airflow is complicated due to its dependency on multiple Docker containers and the need for extensive configuration."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Solution with Astronomer:")," Provides a CLI tool for setting up a local development environment with a single command, simplifying the process of developing, testing, and deploying pipelines. 
The Astronomer CLI also helps in initializing project templates and deploying Dags to the cloud effortlessly.")),Object(o.b)("ol",{start:3},Object(o.b)("li",{parentName:"ol"},"Source Code Management and CI/CD Pipelines:")),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Challenge:")," Collaborative development and continuous integration/deployment (CI/CD) are essential but challenging to implement effectively with Airflow alone."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Solution with Astronomer:")," Facilitates easy integration with GitHub for source code management and GitHub Actions for CI/CD. This allows automatic testing and deployment of pipeline code, ensuring a smooth workflow for teams working on pipeline development.")),Object(o.b)("ol",{start:4},Object(o.b)("li",{parentName:"ol"},"Observing Pipelines and Alarms:")),Object(o.b)("ul",null,Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Challenge:")," Monitoring data pipelines and getting timely alerts when issues occur is crucial but often difficult to achieve."),Object(o.b)("li",{parentName:"ul"},Object(o.b)("strong",{parentName:"li"},"Solution with Astronomer:")," The Astronomer platform provides a user-friendly interface for monitoring pipeline status and performance. It also offers customizable alerts for failures or prolonged task durations, with notifications via email, PagerDuty, or Slack, ensuring immediate awareness and response to issues.")),Object(o.b)("p",null,"Overall, the video shows Astronomer as a powerful and user-friendly platform that addresses the common challenges of using Airflow, from deployment and development to collaboration, CI/CD, and monitoring. It suggests that Astronomer can significantly improve the experience of engineers working with Airflow, making it easier to manage, develop, and monitor data pipelines."),Object(o.b)("h2",{id:"the-best-alternative-to-airlfow-mageai"},"The best alternative to Airlfow? Mage.ai"),Object(o.b)("p",null,Object(o.b)("strong",{parentName:"p"},"Free hands-on tutorial:")," ",Object(o.b)("a",{parentName:"p",href:"https://youtu.be/3gXsFEC3aYA"},"Watch on YouTube")),Object(o.b)("p",null,"In this insightful video, Andreas introduces Mage, a promising alternative to Apache Airflow, focusing on its simplicity, user-friendliness, and scalability. The video provides a comprehensive walkthrough of Mage, highlighting its key features and advantages over Airflow. Here's a breakdown of what viewers can learn and expect from the video:"),Object(o.b)("ol",null,Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Deployment Ease:")," Mage offers a stark contrast to Airflow's complex setup process. It simplifies deployment to a single Docker image, making it straightforward to install and start on any machine, whether it's local or cloud-based on AWS, GCP, or Azure. This simplicity extends to scaling, which Mage handles horizontally, particularly beneficial in Kubernetes environments where performance scales with the number of pipelines."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"User Interface (UI):")," Mage shines with its UI, presenting a dark mode interface that's not only visually appealing but also simplifies navigation and pipeline management. 
The UI facilitates easy access to pipelines, scheduling, and monitoring of pipeline runs, offering a more intuitive experience compared to Airflow."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Pipeline Creation and Modification:")," Mage streamlines the creation of ETL pipelines, allowing users to easily add data loaders, transformers, and exporters through its UI. It supports direct interaction with APIs for data loading and provides a visual representation of the data flow, enhancing the overall pipeline design experience."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Data Visualization and Exploration:")," Beyond simple pipeline creation, Mage enables in-depth data exploration within the UI. Users can generate various charts, such as histograms and bar charts, to analyze the data directly, a feature that greatly enhances the tool's utility."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Testing and Scheduling:")," Testing pipelines in Mage is straightforward, allowing for quick integration of tests to ensure data quality and pipeline reliability. Scheduling is also versatile, supporting standard time-based triggers, event-based triggers for real-time data ingestion, and API calls for on-demand pipeline execution."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Support for Streaming and ELT Processes:")," Mage is not limited to ETL workflows but also supports streaming and ELT processes. It integrates seamlessly with DBT models for in-warehouse transformations and Spark for big data processing, showcasing its versatility and scalability."),Object(o.b)("li",{parentName:"ol"},Object(o.b)("strong",{parentName:"li"},"Conclusion and Call to Action:")," Andreas concludes by praising the direction in which the industry is moving, with tools like Mage simplifying data engineering processes. 
He encourages viewers to try Mage and engage with the content by liking, subscribing, and commenting on their current tools and the potential impact of Mage.")),Object(o.b)("p",null,"Overall, the video shows Mage as a highly user-friendly, scalable, and versatile tool for data pipeline creation and management, offering a compelling alternative to traditional tools like Airflow."))}d.isMDXComponent=!0},88:function(e,t,a){"use strict";a.d(t,"a",(function(){return b})),a.d(t,"b",(function(){return m}));var n=a(0),i=a.n(n);function o(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function r(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,n)}return a}function l(e){for(var t=1;t=0||(i[a]=e[a]);return i}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(i[a]=e[a])}return i}var c=i.a.createContext({}),d=function(e){var t=i.a.useContext(c),a=t;return e&&(a="function"==typeof e?e(t):l(l({},t),e)),a},b=function(e){var t=d(e.components);return i.a.createElement(c.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return i.a.createElement(i.a.Fragment,{},t)}},u=i.a.forwardRef((function(e,t){var a=e.components,n=e.mdxType,o=e.originalType,r=e.parentName,c=s(e,["components","mdxType","originalType","parentName"]),b=d(a),u=n,m=b["".concat(r,".").concat(u)]||b[u]||p[u]||o;return a?i.a.createElement(m,l(l({ref:t},c),{},{components:a})):i.a.createElement(m,l({ref:t},c))}));function m(e,t){var a=arguments,n=t&&t.mdxType;if("string"==typeof e||n){var o=a.length,r=new Array(o);r[0]=u;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:n,r[1]=l;for(var c=2;c 01-Introduction | THE DATA ENGINEERING COOKBOOK - + @@ -138,7 +138,7 @@ | In part 3 I focused on everything regarding Leadership and Communication: team management, project management, collaboration, problem solving, strategic thinking, communication and leadership | Watch on YouTube|

Watch on YouTube

Final Thoughts#

The path to becoming a senior data engineer is both challenging and rewarding. It requires a blend of technical prowess, continuous learning, and the development of soft skills that enable you to lead and innovate. Whether you're just starting out or looking to advance your career, focusing on the key areas outlined above will set you on the right path.

- + diff --git a/docs/02-BasicSkills/index.html b/docs/02-BasicSkills/index.html index 1954c1f..fca38cf 100644 --- a/docs/02-BasicSkills/index.html +++ b/docs/02-BasicSkills/index.html @@ -7,7 +7,7 @@ 02-BasicSkills | THE DATA ENGINEERING COOKBOOK - + @@ -199,7 +199,7 @@ a IoT device, logs, or whatever.

A data catalog is also important. It explains which features are available and how different data sets are labeled.

All different types of data. Now, here comes the engineering part.

The Data Engineer's part is making this data available. Available to the data scientist and the machine learning process.

So when you look at the model, on the left side you have your hyperparameter configuration. You need to store and manage these configurations somehow.

Then you have the actual training data.

There's a lot going on with the training data:

Where does it come from? Who owns it? Which is basically data governance.

What's the lineage? Have you modified this data? What did you do, what was the basis, the raw data?

You need to access all this data somehow. In training and in production.

In production you need to have access to the live data.

All this is the data engineers job. Making the data available.

First, an architect needs to build the platform. This can also be done by a good data engineer.

Then the data engineer needs to build the pipelines. How is the data coming in, and how does the platform connect to other systems?

How is that data then put into storage? Is there pre-processing necessary for the algorithms? The data engineer will do it.

Once the data and the systems are available, it's time for the machine learning part.

It is ready for processing. Basically ready for the data scientist.

Once the analytics is done, the data engineer needs to build pipelines to make the results accessible again. For instance for other analytics processes, for APIs, for front ends and so on.

All in all, the data engineer's part is a computer science part.

That's why I love it so much :)

- + diff --git a/docs/03-AdvancedSkills/index.html b/docs/03-AdvancedSkills/index.html index 09c93ad..ce1d816 100644 --- a/docs/03-AdvancedSkills/index.html +++ b/docs/03-AdvancedSkills/index.html @@ -7,7 +7,7 @@ 03-AdvancedSkills | THE DATA ENGINEERING COOKBOOK - + @@ -436,7 +436,7 @@ already there.

They just need to show us that the algorithms work. The end.

AWS Sagemaker#

Train and apply models online with Sagemaker

Link to the OLX Slideshare with pros, cons and how to use Sagemaker: https://www.slideshare.net/mobile/AlexeyGrigorev/image-models-infrastructure-at-olx

- + diff --git a/docs/04-HandsOnCourse/index.html b/docs/04-HandsOnCourse/index.html index 123a5f3..172afcf 100644 --- a/docs/04-HandsOnCourse/index.html +++ b/docs/04-HandsOnCourse/index.html @@ -7,7 +7,7 @@ 04-HandsOnCourse | THE DATA ENGINEERING COOKBOOK - + @@ -15,11 +15,11 @@ - +
-

04-HandsOnCourse

Data Engineering Course: Building A Data Platform#

Contents#

GenAI Retrieval Augmented Generation with Ollama and ElasticSearch#

  • This how-to is based on this one from Elasticsearch: https://www.elastic.co/search-labs/blog/rag-with-llamaIndex-and-elasticsearch
  • Instead of Elasticsearch cloud we're going to run everything locally
  • The simplest way to get this done is to just clone this GitHub Repo for the code and docker setup
  • I've tried this on a M1 Mac. Changes for Windows with WSL will come later.
  • The biggest problems that I had were actually installing the dependencies rather than the code itself.

Install Ollama#

  1. Download Ollama from here https://ollama.com/download/mac
  2. Unzip, drag into applications and install
  3. do ollama run mistral (It's going to download the Mistral 7b model, 4.1GB size)
  4. Create a new folder in Documents "Elasticsearch-RAG"
  5. Open that folder in VSCode

Install Elasticsearch & Kibana (Docker)#

  1. Use the docker-compose file from this repo: https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/docker-compose.yml
  2. Download Docker Desktop from here: https://www.docker.com/products/docker-desktop/
  3. Install docker desktop and sign in in the app/create a user -> sends you to the browser

For Windows Users +

04-HandsOnCourse

Data Engineering Course: Building A Data Platform#

Contents#

GenAI Retrieval Augmented Generation with Ollama and Elasticsearch#

  • This how-to is based on this guide from Elasticsearch: https://www.elastic.co/search-labs/blog/rag-with-llamaIndex-and-elasticsearch
  • Instead of Elasticsearch cloud we're going to run everything locally
  • The simplest way to get this done is to just clone this GitHub Repo for the code and docker setup
  • I've tried this on an M1 Mac. Changes for Windows with WSL will come later.
  • The biggest problems I had were actually with installing the dependencies rather than with the code itself.

Install Ollama#

  1. Download Ollama from here https://ollama.com/download/mac
  2. Unzip, drag into applications and install
  3. run ollama run mistral (it's going to download the Mistral 7B model, about 4.1GB; a quick way to verify the local API afterwards is sketched below)
  4. Create a new folder in Documents "Elasticsearch-RAG"
  5. Open that folder in VSCode
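
Once ollama run mistral has finished downloading, a quick way to check that the local Ollama API answers is a tiny Python script like the following (a minimal sketch using only the standard library; it assumes Ollama's default port 11434):

import json
import urllib.request

# Ask the locally running Ollama server (default port 11434) for a short completion
req = urllib.request.Request(
    "http://localhost:11434/api/generate",
    data=json.dumps({"model": "mistral", "prompt": "Say hello in one sentence.", "stream": False}).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req, timeout=120) as resp:
    print(json.load(resp)["response"])  # the generated text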

Install Elasticsearch & Kibana (Docker)#

  1. Use the docker-compose file from this repo: https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/docker-compose.yml
  2. Download Docker Desktop from here: https://www.docker.com/products/docker-desktop/
  3. Install Docker Desktop and sign in in the app / create a user -> it sends you to the browser

For Windows users: configure WSL2 to use at most 4GB of RAM:

wsl --shutdown
notepad "$env:USERPROFILE/.wslconfig"

.wslconfig file:

[wsl2]
memory=4GB # Limits VM memory in WSL 2 up to 4GB

Modify the Linux kernel map count in WSL. Do this before starting, because Elasticsearch requires a higher value to work:

sudo sysctl -w vm.max_map_count=262144

  1. go to the Elasticsearch-RAG folder and run docker compose up (a quick Python check that Elasticsearch is reachable follows below)
  2. make sure you have Elasticsearch 8.11 or later (we use 8.16 in this project) if you want to use your own Elasticsearch image
  3. if you get this error on a Mac, just open the console in the Docker app: error getting credentials - err: exec: docker-credential-desktop: executable file not found in $PATH, out:
  4. Install the Xcode command line tools: xcode-select --install
  5. make sure you're on Python 3.8.1 or later -> I installed 3.13.0 from https://www.python.org/downloads/
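
Before moving on, you can also confirm from Python that the Elasticsearch container is reachable (a minimal standard-library sketch; it assumes the compose file exposes port 9200 on localhost without TLS or authentication):

import json
import urllib.request

# The Elasticsearch root endpoint returns cluster and version info as JSON.
# If your compose file enables security, you would need to add auth/TLS here.
with urllib.request.urlopen("http://localhost:9200", timeout=10) as resp:
    info = json.load(resp)
print(info["version"]["number"])  # should be 8.11 or later, e.g. 8.16.x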

Setup the virtual Python environment#

preparation on a Mac#

install brew#

which brew @@ -37,9 +37,9 @@ source .elkrag/bin/activate

Install required libraries (do one at a time so you see errors):#

pip install llama-index (optional python3 -m pip install package name)
pip install llama-index-embeddings-ollama
pip install llama-index-llms-ollama
pip install llama-index-vector-stores-elasticsearch
pip install python-dotenv

Write the data to Elasticsearch#

  1. create / copy in the index.py file
  2. download the conversations.json file from the folder code examples/GenAI-RAG
  3. if you get an error during execution, check whether the pydantic version is <2.0 with pip show pydantic; if not, do this: pip install "pydantic<2.0"
  4. run the program index.py (a minimal sketch of what such a script does is shown below): https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/index.py
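
For orientation, here is a minimal sketch of what such an indexing script can look like with the packages installed above. It is an illustration rather than the exact index.py from the repo (which may target an older llama-index API, hence the pydantic hint above); the "conversation" field name and the calls index name are assumptions, so adjust them to the real conversations.json structure:

import json
from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.vector_stores.elasticsearch import ElasticsearchStore

# Use the local Ollama server to create the embeddings
Settings.embed_model = OllamaEmbedding(model_name="mistral", base_url="http://localhost:11434")

# Load the example data and wrap each record as a LlamaIndex Document
# (assumes each record has a "conversation" text field; adjust to the real JSON structure)
with open("conversations.json") as f:
    records = json.load(f)
documents = [Document(text=r["conversation"]) for r in records]

# Store text chunks and embeddings in the local Elasticsearch index "calls"
vector_store = ElasticsearchStore(index_name="calls", es_url="http://localhost:9200")
storage_context = StorageContext.from_defaults(vector_store=vector_store)
VectorStoreIndex.from_documents(documents, storage_context=storage_context)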

Check the data in Elasticsearch#

  1. go to kibana http://localhost:5601/app/management/data/index_management/indices and see the new index called calls
  2. go to dev tools and try out this query GET calls/_search?size=1 http://localhost:5601/app/dev_tools#/console/shell

Query data from Elasticsearch and create an output with Mistral#

  1. if everything is good, run the query.py file (a minimal sketch is shown below): https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/query.py
  2. try a few queries :)
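
As a rough orientation for the query side, a script along these lines re-opens the calls index and lets the local Mistral model answer over the retrieved documents (a minimal sketch under the same assumptions as above, not the exact query.py from the repo):

from llama_index.core import Settings, VectorStoreIndex
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.vector_stores.elasticsearch import ElasticsearchStore

# Same embedding model as during indexing, plus Mistral as the answering LLM
Settings.embed_model = OllamaEmbedding(model_name="mistral", base_url="http://localhost:11434")
Settings.llm = Ollama(model="mistral", base_url="http://localhost:11434", request_timeout=120.0)

# Re-open the existing "calls" index in Elasticsearch and build a query engine on top of it
vector_store = ElasticsearchStore(index_name="calls", es_url="http://localhost:9200")
index = VectorStoreIndex.from_vector_store(vector_store)
query_engine = index.as_query_engine()

print(query_engine.query("What were the customers complaining about?"))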

Install libraries to extract text from PDFs#

Extract data from CV and put it into Elasticsearch#

I created a CV with ChatGPT https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/Liam_McGivney_CV.pdf

Install the library to extract text from the PDF: pip install PyMuPDF. In VSCode I had to hit Shift+Command+P, run the Python clear workspace cache command, and reload the window. Then it saw it :/
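
Extracting the raw text from a PDF with PyMuPDF takes only a few lines; a minimal sketch (the file name matches the CV linked above, the rest is the standard PyMuPDF API):

import fitz  # PyMuPDF is imported as "fitz"

# Open the PDF and concatenate the plain text of all pages
doc = fitz.open("Liam_McGivney_CV.pdf")
text = "".join(page.get_text() for page in doc)
doc.close()
print(text[:500])  # preview the first 500 characters

The extracted text can then be wrapped in a Document and indexed the same way as the conversations above.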

The file cvpipeline.py has the python code for the indexing. It's not working right now though! -https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/cvpipeline.py

I'll keep developing this and update it once it's working.

Free Data Engineering Course with AWS TDengine Docker and Grafana#

Free hands-on course: Watch on YouTube

In this detailed tutorial video, Andreas guides viewers through creating an end-to-end data pipeline using time series data. The project focuses on fetching weather data from a Weather API, processing it on AWS, storing it in TDengine (a time series database), and visualizing the data with Grafana. Here's a concise summary of what the video covers:

  1. Introduction and Setup:
  • The project is introduced along with a GitHub repository containing all necessary resources and a step-by-step guide.
  • The pipeline architecture includes an IoT weather station, a Weather API, AWS for processing, TDengine for data storage, and Grafana for visualization.
  1. Project Components:
  • Weather API: Utilizes weatherapi.com to fetch weather data.
  • AWS Lambda: Processes the data fetched from the Weather API.
  • TDengine: Serves as the time series database to store processed data. It's highlighted for its performance and simplicity, especially for time series data.
  • Grafana: Used for creating dashboards to visualize the time series data.
  1. Development and Deployment:
  • The local development environment setup includes Python, Docker, and VS Code.
  • The tutorial covers the creation of a Docker image for the project and deploying it to AWS Elastic Container Registry (ECR).
  • AWS Lambda is then configured to use the Docker image from ECR.
  • AWS EventBridge is used to schedule the Lambda function to run at specified intervals.
  1. Time Series Data:
  • The importance of time series data and the benefits of using a time series database like TDengine over traditional relational databases are discussed.
  • TDengine's features such as speed, scaling, data retention, and built-in functions for time series data are highlighted.
  1. Building the Pipeline:
  • Detailed instructions are provided for setting up each component of the pipeline:
    • Fetching weather data from the Weather API.
    • Processing and sending the data to TDengine using an AWS Lambda function.
    • Visualizing the data with Grafana.
  • Each step includes code snippets and configurations needed to implement the pipeline.
  1. Conclusion:
  • The video concludes with a demonstration of the completed pipeline, showing weather data visualizations in Grafana.
  • Viewers are encouraged to replicate the project using the resources provided in the GitHub repository linked in the video description.

This video provides a comprehensive guide to building a data pipeline with a focus on time series data, demonstrating the integration of various technologies and platforms to achieve an end-to-end solution.

Monitor your data in dbt and detect quality issues with Elementary#

Free hands-on tutorial: Watch on YouTube

In this comprehensive tutorial, Andreas delves into the integration of dbt (data build tool) with Elementary to enhance data monitoring and quality detection within Snowflake databases. The tutorial is structured to guide viewers through a hands-on experience, starting with an introduction to a sample project setup and the common challenges faced in monitoring dbt jobs. It then transitions into how Elementary can be utilized to address these challenges effectively.

Key learning points and tutorial structure include:

  1. Introduction to the Sample Project: Andreas showcases a project setup involving Snowflake as the data warehouse, dbt for data modeling and testing, and a visualization tool for data analysis. This setup serves as the basis for the tutorial.
  2. Challenges in Monitoring dbt Jobs: Common issues in monitoring dbt jobs are discussed, highlighting the limitations of the dbt interface in providing comprehensive monitoring capabilities.
  3. Introduction to Elementary: Elementary is introduced as a dbt-native data observability tool designed to enhance the monitoring and analysis of dbt jobs. It offers both open-source and cloud versions, with the tutorial focusing on the cloud version.
  4. Setup Requirements: The tutorial covers the necessary setup on both the Snowflake and dbt sides, including schema creation, user and role configuration in Snowflake, and modifications to the dbt project for integrating with Elementary.
  5. Elementary's User Interface and Features: A thorough walkthrough of Elementary's interface is provided, showcasing its dashboard, test results, model runs, data catalog, and data lineage features. The tool's ability to automatically run additional tests, like anomaly detection and schema change detection, is also highlighted.
  6. Advantages of Using Elementary: The presenter outlines several benefits of using Elementary, such as easy implementation, native test integration, clean and straightforward UI, and enhanced privacy due to data being stored within the user's data warehouse.
  7. Potential Drawbacks: Some potential drawbacks are discussed, including the additional load on dbt job execution due to more models being run and limitations in dashboard customization.
  8. Summary and Verdict: The tutorial concludes with a summary of the key features and benefits of using Elementary with dbt, emphasizing its value in improving data quality monitoring and detection.

Overall, viewers are guided through setting up and utilizing Elementary for dbt data monitoring, gaining insights into its capabilities, setup process, and the practical benefits it offers for data quality assurance.

Solving Engineers 4 Biggest Airflow Problems#

Free hands-on tutorial: Watch on YouTube

In this informative video, Andreas discusses the four major challenges engineers face when working with Apache Airflow and introduces Astronomer, a managed Airflow service that addresses these issues effectively. Astronomer is highlighted as a solution that simplifies Airflow deployment and management, making it easier for engineers to develop, deploy, and monitor their data pipelines. Here's a summary of the key points discussed for each challenge and how Astronomer provides solutions:

  1. Managing Airflow Deployments:
  • Challenge: Setting up and maintaining Airflow deployments is complex and time-consuming, involving configuring cloud instances, managing resources, scaling, and updating the Airflow system.
  • Solution with Astronomer: Offers a straightforward deployment process where users can easily configure their deployments, choose cloud providers (GCP, AWS, Azure), and set up scaling with just a few clicks. Astronomer handles the complexity, making it easier to manage production and quality environments.
  1. Development Environment and Deployment:
  • Challenge: Local installation of Airflow is complicated due to its dependency on multiple Docker containers and the need for extensive configuration.
  • Solution with Astronomer: Provides a CLI tool for setting up a local development environment with a single command, simplifying the process of developing, testing, and deploying pipelines. The Astronomer CLI also helps in initializing project templates and deploying Dags to the cloud effortlessly.
  1. Source Code Management and CI/CD Pipelines:
  • Challenge: Collaborative development and continuous integration/deployment (CI/CD) are essential but challenging to implement effectively with Airflow alone.
  • Solution with Astronomer: Facilitates easy integration with GitHub for source code management and GitHub Actions for CI/CD. This allows automatic testing and deployment of pipeline code, ensuring a smooth workflow for teams working on pipeline development.
  1. Observing Pipelines and Alarms:
  • Challenge: Monitoring data pipelines and getting timely alerts when issues occur is crucial but often difficult to achieve.
  • Solution with Astronomer: The Astronomer platform provides a user-friendly interface for monitoring pipeline status and performance. It also offers customizable alerts for failures or prolonged task durations, with notifications via email, PagerDuty, or Slack, ensuring immediate awareness and response to issues.

Overall, the video shows Astronomer as a powerful and user-friendly platform that addresses the common challenges of using Airflow, from deployment and development to collaboration, CI/CD, and monitoring. It suggests that Astronomer can significantly improve the experience of engineers working with Airflow, making it easier to manage, develop, and monitor data pipelines.

The best alternative to Airflow? Mage.ai#

Free hands-on tutorial: Watch on YouTube

In this insightful video, Andreas introduces Mage, a promising alternative to Apache Airflow, focusing on its simplicity, user-friendliness, and scalability. The video provides a comprehensive walkthrough of Mage, highlighting its key features and advantages over Airflow. Here's a breakdown of what viewers can learn and expect from the video:

  1. Deployment Ease: Mage offers a stark contrast to Airflow's complex setup process. It simplifies deployment to a single Docker image, making it straightforward to install and start on any machine, whether it's local or cloud-based on AWS, GCP, or Azure. This simplicity extends to scaling, which Mage handles horizontally, particularly beneficial in Kubernetes environments where performance scales with the number of pipelines.
  2. User Interface (UI): Mage shines with its UI, presenting a dark mode interface that's not only visually appealing but also simplifies navigation and pipeline management. The UI facilitates easy access to pipelines, scheduling, and monitoring of pipeline runs, offering a more intuitive experience compared to Airflow.
  3. Pipeline Creation and Modification: Mage streamlines the creation of ETL pipelines, allowing users to easily add data loaders, transformers, and exporters through its UI. It supports direct interaction with APIs for data loading and provides a visual representation of the data flow, enhancing the overall pipeline design experience.
  4. Data Visualization and Exploration: Beyond simple pipeline creation, Mage enables in-depth data exploration within the UI. Users can generate various charts, such as histograms and bar charts, to analyze the data directly, a feature that greatly enhances the tool's utility.
  5. Testing and Scheduling: Testing pipelines in Mage is straightforward, allowing for quick integration of tests to ensure data quality and pipeline reliability. Scheduling is also versatile, supporting standard time-based triggers, event-based triggers for real-time data ingestion, and API calls for on-demand pipeline execution.
  6. Support for Streaming and ELT Processes: Mage is not limited to ETL workflows but also supports streaming and ELT processes. It integrates seamlessly with DBT models for in-warehouse transformations and Spark for big data processing, showcasing its versatility and scalability.
  7. Conclusion and Call to Action: Andreas concludes by praising the direction in which the industry is moving, with tools like Mage simplifying data engineering processes. He encourages viewers to try Mage and engage with the content by liking, subscribing, and commenting on their current tools and the potential impact of Mage.

Overall, the video shows Mage as a highly user-friendly, scalable, and versatile tool for data pipeline creation and management, offering a compelling alternative to traditional tools like Airflow.

+https://github.com/andkret/Cookbook/blob/master/Code%20Examples/GenAI-RAG/cvpipeline.py

I'll keep developing this and update it once it's working.

Free Data Engineering Course with AWS TDengine Docker and Grafana#

Free hands-on course: Watch on YouTube

In this detailed tutorial video, Andreas guides viewers through creating an end-to-end data pipeline using time series data. The project focuses on fetching weather data from a Weather API, processing it on AWS, storing it in TDengine (a time series database), and visualizing the data with Grafana. Here's a concise summary of what the video covers:

  1. Introduction and Setup:
  • The project is introduced along with a GitHub repository containing all necessary resources and a step-by-step guide.
  • The pipeline architecture includes an IoT weather station, a Weather API, AWS for processing, TDengine for data storage, and Grafana for visualization.
  2. Project Components:
  • Weather API: Utilizes weatherapi.com to fetch weather data.
  • AWS Lambda: Processes the data fetched from the Weather API.
  • TDengine: Serves as the time series database to store processed data. It's highlighted for its performance and simplicity, especially for time series data.
  • Grafana: Used for creating dashboards to visualize the time series data.
  3. Development and Deployment:
  • The local development environment setup includes Python, Docker, and VS Code.
  • The tutorial covers the creation of a Docker image for the project and deploying it to AWS Elastic Container Registry (ECR).
  • AWS Lambda is then configured to use the Docker image from ECR.
  • AWS EventBridge is used to schedule the Lambda function to run at specified intervals.
  4. Time Series Data:
  • The importance of time series data and the benefits of using a time series database like TDengine over traditional relational databases are discussed.
  • TDengine's features such as speed, scaling, data retention, and built-in functions for time series data are highlighted.
  5. Building the Pipeline:
  • Detailed instructions are provided for setting up each component of the pipeline:
    • Fetching weather data from the Weather API.
    • Processing and sending the data to TDengine using an AWS Lambda function.
    • Visualizing the data with Grafana.
  • Each step includes code snippets and configurations needed to implement the pipeline.
  6. Conclusion:
  • The video concludes with a demonstration of the completed pipeline, showing weather data visualizations in Grafana.
  • Viewers are encouraged to replicate the project using the resources provided in the GitHub repository linked in the video description.

This video provides a comprehensive guide to building a data pipeline with a focus on time series data, demonstrating the integration of various technologies and platforms to achieve an end-to-end solution.
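
To make the Lambda step more concrete, here is a hedged sketch of a handler that pulls the current weather from weatherapi.com and writes one value to TDengine over its REST interface (an illustration only, not the course code: the WEATHER_API_KEY and TDENGINE_URL environment variables, the weather.berlin table with a (ts, temp_c) schema, and the default root:taosdata credentials are all assumptions):

import os
import requests

def lambda_handler(event, context):
    # 1. Fetch the current weather from weatherapi.com
    weather = requests.get(
        "http://api.weatherapi.com/v1/current.json",
        params={"key": os.environ["WEATHER_API_KEY"], "q": "Berlin"},
        timeout=10,
    ).json()
    temp_c = weather["current"]["temp_c"]

    # 2. Write it to TDengine via the REST endpoint (default port 6041);
    #    assumes a table weather.berlin (ts TIMESTAMP, temp_c FLOAT) already exists
    resp = requests.post(
        os.environ.get("TDENGINE_URL", "http://localhost:6041/rest/sql"),
        data=f"INSERT INTO weather.berlin VALUES (NOW, {temp_c})",
        auth=("root", "taosdata"),  # TDengine default credentials, change for real deployments
        timeout=10,
    )
    resp.raise_for_status()
    return {"statusCode": 200, "body": f"wrote temp_c={temp_c}"}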

Monitor your data in dbt and detect quality issues with Elementary#

Free hands-on tutorial: Watch on YouTube

In this comprehensive tutorial, Andreas delves into the integration of dbt (data build tool) with Elementary to enhance data monitoring and quality detection within Snowflake databases. The tutorial is structured to guide viewers through a hands-on experience, starting with an introduction to a sample project setup and the common challenges faced in monitoring dbt jobs. It then transitions into how Elementary can be utilized to address these challenges effectively.

Key learning points and tutorial structure include:

  1. Introduction to the Sample Project: Andreas showcases a project setup involving Snowflake as the data warehouse, dbt for data modeling and testing, and a visualization tool for data analysis. This setup serves as the basis for the tutorial.
  2. Challenges in Monitoring dbt Jobs: Common issues in monitoring dbt jobs are discussed, highlighting the limitations of the dbt interface in providing comprehensive monitoring capabilities.
  3. Introduction to Elementary: Elementary is introduced as a dbt-native data observability tool designed to enhance the monitoring and analysis of dbt jobs. It offers both open-source and cloud versions, with the tutorial focusing on the cloud version.
  4. Setup Requirements: The tutorial covers the necessary setup on both the Snowflake and dbt sides, including schema creation, user and role configuration in Snowflake, and modifications to the dbt project for integrating with Elementary.
  5. Elementary's User Interface and Features: A thorough walkthrough of Elementary's interface is provided, showcasing its dashboard, test results, model runs, data catalog, and data lineage features. The tool's ability to automatically run additional tests, like anomaly detection and schema change detection, is also highlighted.
  6. Advantages of Using Elementary: The presenter outlines several benefits of using Elementary, such as easy implementation, native test integration, clean and straightforward UI, and enhanced privacy due to data being stored within the user's data warehouse.
  7. Potential Drawbacks: Some potential drawbacks are discussed, including the additional load on dbt job execution due to more models being run and limitations in dashboard customization.
  8. Summary and Verdict: The tutorial concludes with a summary of the key features and benefits of using Elementary with dbt, emphasizing its value in improving data quality monitoring and detection.

Overall, viewers are guided through setting up and utilizing Elementary for dbt data monitoring, gaining insights into its capabilities, setup process, and the practical benefits it offers for data quality assurance.

Solving Engineers' 4 Biggest Airflow Problems#

Free hands-on tutorial: Watch on YouTube

In this informative video, Andreas discusses the four major challenges engineers face when working with Apache Airflow and introduces Astronomer, a managed Airflow service that addresses these issues effectively. Astronomer is highlighted as a solution that simplifies Airflow deployment and management, making it easier for engineers to develop, deploy, and monitor their data pipelines. Here's a summary of the key points discussed for each challenge and how Astronomer provides solutions:

  1. Managing Airflow Deployments:
  • Challenge: Setting up and maintaining Airflow deployments is complex and time-consuming, involving configuring cloud instances, managing resources, scaling, and updating the Airflow system.
  • Solution with Astronomer: Offers a straightforward deployment process where users can easily configure their deployments, choose cloud providers (GCP, AWS, Azure), and set up scaling with just a few clicks. Astronomer handles the complexity, making it easier to manage production and quality environments.
  2. Development Environment and Deployment:
  • Challenge: Local installation of Airflow is complicated due to its dependency on multiple Docker containers and the need for extensive configuration.
  • Solution with Astronomer: Provides a CLI tool for setting up a local development environment with a single command, simplifying the process of developing, testing, and deploying pipelines. The Astronomer CLI also helps in initializing project templates and deploying DAGs to the cloud effortlessly.
  3. Source Code Management and CI/CD Pipelines:
  • Challenge: Collaborative development and continuous integration/deployment (CI/CD) are essential but challenging to implement effectively with Airflow alone.
  • Solution with Astronomer: Facilitates easy integration with GitHub for source code management and GitHub Actions for CI/CD. This allows automatic testing and deployment of pipeline code, ensuring a smooth workflow for teams working on pipeline development.
  4. Observing Pipelines and Alarms:
  • Challenge: Monitoring data pipelines and getting timely alerts when issues occur is crucial but often difficult to achieve.
  • Solution with Astronomer: The Astronomer platform provides a user-friendly interface for monitoring pipeline status and performance. It also offers customizable alerts for failures or prolonged task durations, with notifications via email, PagerDuty, or Slack, ensuring immediate awareness and response to issues.

Overall, the video shows Astronomer as a powerful and user-friendly platform that addresses the common challenges of using Airflow, from deployment and development to collaboration, CI/CD, and monitoring. It suggests that Astronomer can significantly improve the experience of engineers working with Airflow, making it easier to manage, develop, and monitor data pipelines.

The best alternative to Airflow? Mage.ai#

Free hands-on tutorial: Watch on YouTube

In this insightful video, Andreas introduces Mage, a promising alternative to Apache Airflow, focusing on its simplicity, user-friendliness, and scalability. The video provides a comprehensive walkthrough of Mage, highlighting its key features and advantages over Airflow. Here's a breakdown of what viewers can learn and expect from the video:

  1. Deployment Ease: Mage offers a stark contrast to Airflow's complex setup process. It simplifies deployment to a single Docker image, making it straightforward to install and start on any machine, whether it's local or cloud-based on AWS, GCP, or Azure. This simplicity extends to scaling, which Mage handles horizontally, particularly beneficial in Kubernetes environments where performance scales with the number of pipelines.
  2. User Interface (UI): Mage shines with its UI, presenting a dark mode interface that's not only visually appealing but also simplifies navigation and pipeline management. The UI facilitates easy access to pipelines, scheduling, and monitoring of pipeline runs, offering a more intuitive experience compared to Airflow.
  3. Pipeline Creation and Modification: Mage streamlines the creation of ETL pipelines, allowing users to easily add data loaders, transformers, and exporters through its UI. It supports direct interaction with APIs for data loading and provides a visual representation of the data flow, enhancing the overall pipeline design experience.
  4. Data Visualization and Exploration: Beyond simple pipeline creation, Mage enables in-depth data exploration within the UI. Users can generate various charts, such as histograms and bar charts, to analyze the data directly, a feature that greatly enhances the tool's utility.
  5. Testing and Scheduling: Testing pipelines in Mage is straightforward, allowing for quick integration of tests to ensure data quality and pipeline reliability. Scheduling is also versatile, supporting standard time-based triggers, event-based triggers for real-time data ingestion, and API calls for on-demand pipeline execution.
  6. Support for Streaming and ELT Processes: Mage is not limited to ETL workflows but also supports streaming and ELT processes. It integrates seamlessly with DBT models for in-warehouse transformations and Spark for big data processing, showcasing its versatility and scalability.
  7. Conclusion and Call to Action: Andreas concludes by praising the direction in which the industry is moving, with tools like Mage simplifying data engineering processes. He encourages viewers to try Mage and engage with the content by liking, subscribing, and commenting on their current tools and the potential impact of Mage.

Overall, the video shows Mage as a highly user-friendly, scalable, and versatile tool for data pipeline creation and management, offering a compelling alternative to traditional tools like Airflow.
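
To give a feeling for what working in Mage looks like, below is a rough sketch of a data loader block that pulls JSON from a public API. It mirrors the block template Mage generates in its UI, but treat the details (the guarded import, the decorator path, the example URL) as assumptions and let Mage scaffold the real block for you:

import pandas as pd
import requests

# In a Mage project the decorator is injected by the framework;
# the guarded import below mirrors Mage's generated block templates.
if 'data_loader' not in globals():
    from mage_ai.data_preparation.decorators import data_loader

@data_loader
def load_data_from_api(*args, **kwargs):
    # Example API call, replace the URL with your own data source
    url = "https://api.open-meteo.com/v1/forecast?latitude=52.52&longitude=13.41&hourly=temperature_2m"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    hourly = response.json()["hourly"]
    return pd.DataFrame(hourly)  # Mage passes this DataFrame to the next block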

- + @@ -47,6 +47,6 @@ - + \ No newline at end of file diff --git a/docs/05-CaseStudies/index.html b/docs/05-CaseStudies/index.html index f733273..597a99f 100644 --- a/docs/05-CaseStudies/index.html +++ b/docs/05-CaseStudies/index.html @@ -7,7 +7,7 @@ 05-CaseStudies | THE DATA ENGINEERING COOKBOOK - + @@ -131,7 +131,7 @@ https://databricks.com/session/continuous-applications-at-scale-of-100-teams-with-databricks-delta-and-structured-streaming

Talk at Strata London slides: https://databricks.com/session/continuous-applications-at-scale-of-100-teams-with-databricks-delta-and-structured-streaming

https://jobs.zalando.com/tech/blog/what-is-hardcore-data-science--in-practice/?gh_src=4n3gxh1

https://jobs.zalando.com/tech/blog/complex-event-generation-for-business-process-monitoring-using-apache-flink/

- + diff --git a/docs/06-BestPracticesCloud/index.html b/docs/06-BestPracticesCloud/index.html index 4c40c5e..37ab405 100644 --- a/docs/06-BestPracticesCloud/index.html +++ b/docs/06-BestPracticesCloud/index.html @@ -7,7 +7,7 @@ 06-BestPracticesCloud | THE DATA ENGINEERING COOKBOOK - + @@ -26,7 +26,7 @@ They are also useful for AWS and GCP, just try to change out the tools.

As always, I am going to add more stuff to this over time.

Have fun!

Contents#

AWS#

Connect#

  • Elastic Beanstalk (very old)
  • SES Simple Email Service
  • API Gateway

Buffer#

  • Kinesis
  • Kinesis Data Firehose
  • Managed Streaming for Kafka (MSK)
  • MQ
  • Simple Queue Service (SQS)
  • Simple Notification Service (SNS)

Processing#

  • EC2
  • Athena
  • EMR
  • Elasticsearch
  • Kinesis Data Analytics
  • Glue
  • Step Functions
  • Fargate
  • Lambda
  • SageMaker

Store#

  • Simple Storage Service (S3)
  • Redshift
  • Aurora
  • RDS
  • DynamoDB
  • ElastiCache
  • Neptune Graph DB
  • Timestream
  • DocumentDB (MongoDB compatible)

Visualize#

  • Quicksight

Containerization#

  • Elastic Container Service (ECS)
  • Elastic Container Registry (ECR)
  • Elastic Kubernetes Service (EKS)

Best Practices#

Deploying a Spring Boot Application on AWS Using AWS Elastic Beanstalk:

https://aws.amazon.com/de/blogs/devops/deploying-a-spring-boot-application-on-aws-using-aws-elastic-beanstalk/

How to deploy a Docker Container on AWS:

https://aws.amazon.com/getting-started/hands-on/deploy-docker-containers/

AWS platform architecture for GenAI#

Click here to watch

I recorded a reaction video to an AWS platform architecture for GenAI called Tailwinds. Presented by John from Innovative Solutions and Josh from AWS, it has two main flows: indexing and consumer.

Data enters through S3 buckets or an API gateway, is processed by AWS Lambda or Glue, stored in a vector or graph database, and then indexed in OpenSearch. Applications like chatbots use an API gateway to trigger Lambda functions for data retrieval and processing. This flexible serverless setup supports various data formats and uses tools like SAM and Terraform.

Amazon Bedrock helps customers choose and evaluate models. The architecture is flexible but requires effort to create the necessary Lambda functions. Check out the video and share your thoughts!
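
As a rough illustration of the indexing flow described above (my sketch, not Tailwinds' actual code), a Lambda-style function could embed a text chunk with a Bedrock embedding model and write it into OpenSearch along these lines; the endpoint, credentials, index name, and model id are assumptions:

import json
import boto3
from opensearchpy import OpenSearch

bedrock = boto3.client("bedrock-runtime")
opensearch = OpenSearch(hosts=["https://my-opensearch-endpoint:443"], http_auth=("user", "pass"))  # placeholders

def index_chunk(doc_id: str, text: str) -> None:
    # 1. Create an embedding for the text chunk with a Bedrock embedding model
    response = bedrock.invoke_model(
        modelId="amazon.titan-embed-text-v1",
        body=json.dumps({"inputText": text}),
    )
    embedding = json.loads(response["body"].read())["embedding"]

    # 2. Store text and vector together (assumes an index with a k-NN mapping for "embedding")
    opensearch.index(
        index="documents",
        id=doc_id,
        body={"text": text, "embedding": embedding},
    )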

Click here to watch

Generative AI enabled job search engine#

Click here to watch

Hey everyone, I recorded a reaction video to an AWS platform architecture for a Gen AI job search engine. Presented by Andrea from AWS and Bill from Healthy Careers, this setup uses generative AI to enhance job searches for healthcare professionals.

The architecture uses Elastic Container Service (ECS) to handle user queries, processed by Claude II for prompt checks and geolocation. Cleaned prompts are vectorized using Amazon's Titan model, with user search history fetched from an SQL database. Search results are stored in Elasticsearch, updating every six hours. Finally, Claude II generates a response from the search results and sends it back to the user.

I found the use of Claude II for prompt sanitization and geolocation, and the integration of multiple AI models through AWS Bedrock, particularly interesting. This setup keeps data private and provides a flexible, efficient job search experience.

Check out the video and share your thoughts!

Voice transcription and analysis on AWS#

Click here to watch

Hey everyone, I recorded a reaction video to an AWS architecture for voice transcription and analysis. Presented by Nuan from AWS and Ben from Assembly AI, this system is designed to handle large-scale audio data processing.

Users upload audio data via an API to an ECS container. The data is then managed by an orchestrator that decides which models to use and in what order. The orchestrator sends tasks to SQS, which triggers various ML models running on ECS. These models handle tasks like speech-to-text conversion, sentiment analysis, and speaker labeling. Results are stored in S3 and users are notified via SNS and a Lambda function when processing is complete.

I found the use of ECS for containerized applications and the flexibility of swapping models through ECR particularly interesting. This architecture ensures scalability and efficiency, making it ideal for handling millions of requests per day.

Check out the video and share your thoughts!

GeoSpatial Data Analysis#

Click here to watch

Hey everyone, I recorded a reaction video to an AWS architecture for geospatial data analysis by TCS. Presented by David John and Suryakant from TCS, this platform is used in next-gen agriculture for tasks like crop health, yield, and soil moisture analysis.

The platform uses data from satellites, AWS open data, and field agents, processing it with Lambda, Sagemaker, and PostgreSQL. Data is stored and analyzed in S3 buckets and PostgreSQL, with results made accessible via EKS-deployed UIs on EC2 instances, buffered through CloudFront for efficiency.

Key aspects include:

  • Lambda functions triggering Sagemaker jobs for machine learning.
  • Sagemaker handling extensive processing tasks.
  • PostgreSQL and S3 for storing processed data.
  • CloudFront caching data to enhance user experience.

I found the use of parallel Sagemaker jobs for scalability and the integration of open data for cost efficiency particularly interesting. This setup effectively meets the agricultural sector's data analysis needs.

Check out the video and share your thoughts!

Building a Self-Service Enterprise Data Engineering Platform#

Click here to watch

Hey everyone, I recorded a reaction video to an AWS architecture for a self-service enterprise data engineering platform by ZS Associates. Presented by David John and Laken from ZS Associates, this platform is designed to streamline data integration, infrastructure provisioning, and data access for life sciences companies.

Key components:

  • Users and Interaction: Data engineers and analysts interact through a self-service web portal, selecting infrastructure types and providing project details. This portal makes REST requests to EKS, which creates records in PostgreSQL and triggers infrastructure provisioning via SQS.
  • Infrastructure Provisioning: EKS processes SQS messages to provision infrastructure such as EMR clusters, databases in Glue Catalog, S3 buckets, and EC2 instances with containerized services like Airflow or NiFi. IAM roles are configured for access control.
  • Data Governance and Security: All data sets are accessed through the Glue Catalog, with governance workflows requiring approval from data owners via SES notifications. EKS updates IAM roles and Ranger policies for fine-grained access control.
  • Scalability and Efficiency: EKS hosts 100+ microservices supporting workflows and UI portals. The platform handles millions of API requests and hundreds of data access requests monthly, with auto-scaling capabilities to manage costs.

This architecture effectively reduces time to market, enhances security at scale, and optimizes costs by automating data access and infrastructure provisioning. It also ensures data governance and security through controlled access and approval processes.

Check out the video and share your thoughts!

Customer Support Platform#

Click here to watch

Hey everyone, I recorded a reaction video to an AWS architecture for a personalized customer support platform by Traeger. Presented by David John and Lizzy from Traeger, this system enhances customer support by leveraging data from Shopify, EventBridge, Kinesis Data Firehose, S3, Lambda, DynamoDB, and Amazon Connect.

Key components:

  • Order Processing: Customer order data from Shopify flows into EventBridge, then to Kinesis Data Firehose, which writes it to S3. An event trigger in S3 invokes a Lambda function that stores specific order metadata in DynamoDB.
  • Personalized Customer Support: When a customer calls, Amazon Connect uses Pinpoint to determine the call's origin, personalizing the language options. Connect triggers a Lambda function to query DynamoDB for customer metadata based on the phone number. This data is used to inform the customer support agent.
  • Reason for Contact: Amazon Lex bot asks the customer the reason for their call, and this information, along with customer metadata, routes the call to a specialized support queue.

I found the use of DynamoDB for storing customer metadata and the integration with Amazon Connect and Lex for personalized support particularly interesting. The architecture is scalable and ensures a personalized experience for customers.
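
As a small illustration of the order-processing step (an assumption-heavy sketch, not Traeger's code: the bucket layout, JSON fields, and the order-metadata table name are invented for the example), the S3-triggered Lambda could look roughly like this:

import json
import boto3

s3 = boto3.client("s3")
table = boto3.resource("dynamodb").Table("order-metadata")  # hypothetical table name

def lambda_handler(event, context):
    # The S3 event contains one record per uploaded order file
    for record in event["Records"]:
        bucket = record["s3"]["bucket"]["name"]
        key = record["s3"]["object"]["key"]

        # Read the order JSON that Kinesis Data Firehose wrote to S3
        order = json.loads(s3.get_object(Bucket=bucket, Key=key)["Body"].read())

        # Keep only the metadata needed to personalize a support call (fields assumed)
        table.put_item(Item={
            "phone_number": order["customer"]["phone"],
            "order_id": str(order["id"]),
            "status": order.get("fulfillment_status", "unknown"),
        })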

Check out the video and share your thoughts!

League of Legends Data Platform on AWS#

Click here to watch

Hey everyone, I recorded a reaction video to an AWS architecture for the data platform that powers League of Legends by Riot Games. Presented by David John and the team at Riot Games, this system handles massive amounts of data generated by millions of players worldwide.

Key components:

  • Player Interaction: Players connect to game servers globally. The game client communicates with an API running in EKS. This setup ensures low latency and optimal performance.
  • Data Ingestion: The game client and server send data about player interactions to EKS, which flows into MSK (Managed Streaming for Kafka). Local Kafka clusters buffer the data before it’s replicated to regional MSK clusters using MirrorMaker.
  • Data Processing: Spark Streaming jobs process the data from MSK and store it in Delta Lake on S3. This setup ensures efficient data handling and reduces latency in data availability.
  • Data Storage and Access: Glue serves as the data catalog, managing metadata and permissions. Data consumers, including analysts, designers, engineers, and executives, access this data through Databricks, leveraging Glue for structured queries.

I found the use of MSK and Spark for scalable data ingestion and processing particularly interesting. This architecture supports real-time analytics, allowing Riot Games to quickly assess the impact of new patches and gameplay changes.
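
The Kafka-to-Delta step can be pictured with a small PySpark structured streaming sketch (illustrative only, not Riot's code; broker addresses, topic name, and S3 paths are placeholders, and it assumes a Spark runtime with the Kafka source and Delta Lake packages available):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("game-events-to-delta").getOrCreate()

# Read the replicated game events from the regional MSK/Kafka cluster
events = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "broker-1:9092")   # placeholder brokers
    .option("subscribe", "game-events")                   # placeholder topic
    .option("startingOffsets", "latest")
    .load()
)

# Persist the raw payload to a Delta table on S3 for downstream consumers
query = (
    events.selectExpr("CAST(value AS STRING) AS payload", "timestamp")
    .writeStream.format("delta")
    .option("checkpointLocation", "s3://my-bucket/checkpoints/game-events")  # placeholder path
    .start("s3://my-bucket/delta/game-events")                               # placeholder path
)
query.awaitTermination()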

Check out the video and share your thoughts!

Platform Connecting 70 Million Cars#

Click here to watch

Hey everyone, I recorded a reaction video to an AWS architecture for a connected car platform by Mobileye. Presented by David John and the team at Mobileye, this system connects 70 million cars, collecting and processing data to offer digital services and fleet analysis.

Key components:

  • Data Collection: Cars collect anonymized data using sensors and visual inspections, sending it to a REST API and storing it in S3.
  • Data Processing: The data is pulled from S3 into SQS and processed by EKS workers, which scale according to the queue size. Processed data is stored back in S3 and further analyzed using step functions and Lambda for tasks like extracting construction zones and clustering observations.
  • Data Storage: Processed data is stored in S3, Elasticsearch, and CockroachDB. Elasticsearch handles document-based data with self-indexing, while CockroachDB supports frequent updates.
  • Data Consumption: EKS hosts a secured REST API and web application, allowing customers like city planners to access insights on pedestrian and bicycle traffic.

Future plans include enabling cloud image processing on EKS with GPU instances and focusing on cost reduction as data flow increases.

I found the use of EKS for scalable data processing and the combination of Elasticsearch and CockroachDB for different data needs particularly interesting. This architecture efficiently handles large-scale data from millions of connected cars.

Check out the video and share your thoughts!

55TB A Day: Nielsen AWS Data Architecture#

Click here to watch

Hey everyone, I recorded a reaction video to an AWS architecture for Nielsen Marketing Cloud, which processes 55TB of data daily. Presented by David John, this system handles marketing segmentation data for campaigns.

Key components:

  • Data Ingestion: Marketing data comes in files, written to S3. Spark on EMR processes and transforms the data, writing the output to another S3 bucket.
  • Data Processing: Lambda functions handle the final formatting and upload the data to over 100 ad networks. Metadata about file processing is managed in a PostgreSQL RDS database.
  • Metadata Management: A work manager Lambda reads metadata from RDS, triggers processing jobs in EMR, and updates the metadata post-processing.
  • Scaling and Rate Limiting: The serverless architecture allows automatic scaling. However, rate limiting is implemented to prevent overloading ad networks, ensuring they handle data bursts smoothly.

Challenges and Solutions:

  • Scale: The system handles 250 billion events per day, scaling up and down automatically to manage peak loads.
  • Rate Limiting: To avoid overwhelming ad networks, a rate-limiting mechanism was introduced, managing data flow based on network capacity.
  • Back Pressure Management: SQS is used to buffer Lambda responses, preventing direct overload on the PostgreSQL database.

I found the use of SQS for metadata management and the serverless architecture for handling massive data loads particularly interesting. This setup ensures efficient data processing and smooth delivery to ad networks.

Check out the video and share your thoughts!

Orange Theory Fitness#

Click here to watch

Hey, everybody! Today, I'm reacting to the AWS data infrastructure at Orange Theory Fitness, where they collect data from wristbands and training machines. Let's dive in and see how they manage it all.

Key Components#

  1. Local Server: Aggregates data from in-studio equipment and mobile apps, ensuring resiliency if the cloud connection is lost.
  2. API Gateway and Cognito: Handle authentication and route data to the cloud.
  3. Lambda Functions: Process data.
  4. Aurora RDS (MySQL): Stores structured data like member profiles, class bookings, and studio information.
  5. DynamoDB: Stores performance metrics and workout statistics for quick access.
  6. S3: Serves as a data lake, storing telemetry data.
  7. Kinesis Firehose: Streams telemetry data to S3.

Challenges & Solutions#

  1. Resiliency

    • Challenge: Ensure operations continue if cloud connection is lost.
    • Solution: Local server aggregates data and syncs with the cloud once the connection is restored.
  2. Data Integration

    • Challenge: Integrate data from various sources.
    • Solution: Use API Gateway and Cognito for unified authentication and data routing.
  3. Data Processing

    • Challenge: Efficiently process and store different types of data.
    • Solution: Use Lambda for processing, Aurora RDS for structured data, DynamoDB for quick access to performance metrics, and Kinesis Firehose with S3 for streaming and storing large volumes of telemetry data.

This architecture leverages AWS tools for scalability, flexibility, and resilience, making it an excellent example of a well-thought-out data infrastructure for a fitness application.

Let me know your thoughts in the comments. What do you think of this architecture? Would you have done anything differently? If you have any questions, feel free to ask. And if you're interested in learning more about data engineering, check out my academy at learndataengineering.com. See you in the next video!

More Details#

AWS Whitepapers:

https://d1.awsstatic.com/whitepapers/aws-overview.pdf

Azure#

Connect#

  • Event Hub
  • IoT Hub

Buffer#

  • Data Factory
  • Event Hub
  • RedisCache (also Store)

Processing#

  • Stream Analytics Service
  • Azure Databricks
  • Machine Learning
  • Azure Functions
  • Azure HDInsight (Hadoop PaaS)

Store#

  • Blob
  • CosmosDB
  • MariaDB
  • MySQL
  • PostgreSQL
  • SQL
  • Azure Data lake
  • Azure Storage (SQL Table?)
  • Azure Synapse Analytics

Visualize#

  • PowerBI

Containerization#

  • Virtual Machines
  • Virtual Machine Scale Sets
  • Azure Container Service (ACS)
  • Container Instances
  • Azure Kubernetes Service

Best Practices#

Advanced Analytics Architecture:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/advanced-analytics-on-big-data

Anomaly Detection in Real-time Data Streams:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/anomaly-detection-in-real-time-data-streams

Modern Data Warehouse Architecture:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/modern-data-warehouse

CI/CD for Containers:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/cicd-for-containers

Real Time Analytics on Big Data Architecture:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/real-time-analytics

IoT Architecture – Azure IoT Subsystems:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/azure-iot-subsystems

Tier Applications & Data for Analytics:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/tiered-data-for-analytics

Extract, transform, and load (ETL) using HDInsight:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/extract-transform-and-load-using-hdinsight

IoT using Cosmos DB:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/iot-using-cosmos-db

Streaming using HDInsight:

https://docs.microsoft.com/en-us/azure/architecture/solution-ideas/articles/streaming-using-hdinsight

GCP#

Connect#

  • Cloud IoT Core
  • App Engine
  • Cloud Dataflow

Buffer#

  • Pub/Sub

Processing#

  • Compute Engine
  • Cloud Functions
  • Specialized tools:
    • Cloud Dataflow
    • Cloud Dataproc
    • Cloud Datalab
    • Cloud Dataprep
    • Cloud Composer
  • App Engine

Store#

  • Cloud Storage
  • Cloud SQL
  • Cloud Spanner
  • Cloud Datastore
  • Cloud BigTable
  • Cloud Memorystore
  • BigQuery

Visualize#

Containerization#

  • Kubernetes Engine
  • Container Security

Best Practices#

Thanks to Ismail Holoubi for the following GCP links

Best practices for migrating virtual machines to Compute Engine:

https://cloud.google.com/solutions/best-practices-migrating-vm-to-compute-engine

Best practices for Cloud Storage:

https://cloud.google.com/storage/docs/best-practices

Moving a publishing workflow to BigQuery for new data insights:

https://cloud.google.com/blog/products/data-analytics/moving-a-publishing-workflow-to-bigquery-for-new-data-insights

Architecture: Optimizing large-scale ingestion of analytics events and logs:

https://cloud.google.com/solutions/architecture/optimized-large-scale-analytics-ingestion

Choosing the right architecture for global data distribution:

https://cloud.google.com/solutions/architecture/global-data-distribution

Best Practices for Operating Containers:

https://cloud.google.com/solutions/best-practices-for-operating-containers

Automating IoT Machine Learning: Bridging Cloud and Device Benefits with AI Platform:

https://cloud.google.com/solutions/automating-iot-machine-learning

- + diff --git a/docs/07-DataSources/index.html b/docs/07-DataSources/index.html index d0ef4d5..e1c6a05 100644 --- a/docs/07-DataSources/index.html +++ b/docs/07-DataSources/index.html @@ -7,7 +7,7 @@ 07-DataSources | THE DATA ENGINEERING COOKBOOK - + @@ -23,7 +23,7 @@ So, I started this section to make it easier to find good sources.

I've taken these links from articles and blog posts. Why not only link the articles? You know, these posts can go away at any time. I want to keep the links to the platforms either way.

I haven't had the chance to check each link myself. Please let me know if something isn't right.

You can find the articles on the bottom of this section to read more. They include even more data sources I haven't had time to add to this list.

Contents:#

General And Academic#

Content Marketing#

Crime#

Drugs#

Education#

Entertainment#

Environmental And Weather Data#

Financial And Economic Data#

Government And World#

Health#

Human Rights#

Labor And Employment Data#

Politics#

Retail#

Social#

Travel And Transportation#

Various Portals#

Source Articles and Blog Posts#

- + diff --git a/docs/08-InterviewQuestions/index.html b/docs/08-InterviewQuestions/index.html index ab8214f..6d14bd1 100644 --- a/docs/08-InterviewQuestions/index.html +++ b/docs/08-InterviewQuestions/index.html @@ -7,7 +7,7 @@ 08-InterviewQuestions | THE DATA ENGINEERING COOKBOOK - + @@ -46,7 +46,7 @@ considered when choosing a DB?)

  • How to choose the right storage for different data consumers? It's always a tricky question.

Apache Flink#

  • What is Flink used for?

  • Flink vs Spark?

GitHub#

  • What are branches?

  • What are commits?

  • What's a pull request?

Dev/Ops#

  • What is continuous integration?

  • What is continuous deployment?

  • Difference CI/CD

Development / Agile#

  • What is Scrum?

  • What is OKR?

  • What is Jira and what is it used for?

    - + diff --git a/docs/09-BooksAndCourses/index.html b/docs/09-BooksAndCourses/index.html index f521638..076635e 100644 --- a/docs/09-BooksAndCourses/index.html +++ b/docs/09-BooksAndCourses/index.html @@ -7,7 +7,7 @@ 09-BooksAndCourses | THE DATA ENGINEERING COOKBOOK - + @@ -22,7 +22,7 @@

    09-BooksAndCourses

    Recommended Books, Courses, and Podcasts#

    Contents#

    About Books, Courses, and Podcasts#

    This is a collection of books and courses I can recommend personally. They are great for every data engineering learner.

    I have either used or owned these books during my professional work.

    I also looked into every online course personally.

    If you want to buy a book or course and support my work, please use one of my links below. They are all affiliate marketing links that help me fund this passion.

    Of course all this comes at no additional expense to you, but it helps me a lot.

    You can find even more interesting books and my whole podcast equipment on my Amazon store:

    Go to the Amazon store

    PS: Don't just get a book and expect to learn everything

    • Course certificates alone won't help you
    • Have a purpose in mind, like a small project
    • Books and courses are also great to use as references at work

    Books#

    Languages#

    Java#

    Learning Java: A Bestselling Hands-On Java Tutorial

    Python#

    Learning Python, 5th Edition

    Scala#

    Programming Scala: Scalability = Functional Programming + Objects

    Swift#

    Learning Swift: Building Apps for macOS, iOS, and Beyond

    Data Science Tools#

    Apache Spark#

    Learning Spark: Lightning-Fast Big Data Analysis

    Apache Kafka#

    Kafka Streams in Action: Real-time apps and microservices with the Kafka Streams API

    Apache Hadoop#

    Hadoop: The Definitive Guide: Storage and Analysis at Internet Scale

    Apache HBase#

    HBase: The Definitive Guide: Random Access to Your Planet-Size Data

    Business#

    The Lean Startup#

    The Lean Startup: How Today's Entrepreneurs Use Continuous Innovation to Create Radically Successful Businesses

    Zero to One#

    Zero to One: Notes on Startups, or How to Build the Future

    The Innovator's Dilemma#

    The Innovator's Dilemma: When New Technologies Cause Great Firms to Fail (Management of Innovation and Change)

    Crossing the Chasm#

    Crossing the Chasm, 3rd Edition (Collins Business Essentials)

    Crush It!#

    Crush It!: Why Now Is The Time To Cash In On Your Passion

    Community Recommendations#

    Designing Data-Intensive Applications#

    "In my opinion, the knowledge contained in this book differentiates a data engineer from a software engineer or a developer. The book strikes a good balance between breadth and depth of discussion on data engineering topics, as well as the tradeoffs we must make due to working with massive amounts of data." -- David Lee on LinkedIn

    Designing Data-Intensive Applications: The Big Ideas Behind Reliable, Scalable, and Maintainable Systems

    Online Courses#

    Preparation courses#

Course name | Course description | Course URL
The Bits and Bytes of Computer Networking | This course is designed to provide a full overview of computer networking. We'll cover everything from the fundamentals of modern networking technologies and protocols to an overview of the cloud to practical applications and network troubleshooting. | https://www.coursera.org/learn/computer-networking
Learn SQL (Codecademy) | In this SQL course, you'll learn how to manage large datasets and analyze real data using the standard data management language. | https://www.codecademy.com/learn/learn-sql
Learn Python 3 (Codecademy) | Learn the basics of Python 3, one of the most powerful, versatile, and in-demand programming languages today. | https://www.codecademy.com/learn/learn-python-3

    Data engineering courses#

Course name | Course description | Course URL

1. Data Engineering Basics
Introduction to Data Engineering | Introduction to Data Engineering with over 1 hour of videos, including my journey here. | https://learndataengineering.com/p/introduction-to-data-engineering
Computer Science Fundamentals | A complete guide of topics and resources you should know as a Data Engineer. | https://learndataengineering.com/p/data-engineering-fundamentals
Introduction to Python | Learn all the fundamentals of Python to start coding quickly. | https://learndataengineering.com/p/introduction-to-python
Python for Data Engineers | Learn all the Python topics a Data Engineer needs, even if you don't have a coding background. | https://learndataengineering.com/p/python-for-data-engineers
Docker Fundamentals | Learn all the fundamental Docker concepts with hands-on examples. | https://learndataengineering.com/p/docker-fundamentals
Successful Job Application | Everything you need to get your dream job in Data Engineering. | https://learndataengineering.com/p/successful-job-application
Data Preparation & Cleaning for ML | All you need for preparing data to enable Machine Learning. | https://learndataengineering.com/p/data-preparation-and-cleaning-for-ml

2. Platform & Pipeline Design Fundamentals
Data Platform And Pipeline Design | Learn how to build data pipelines with templates and examples for Azure, GCP and Hadoop. | https://learndataengineering.com/p/data-pipeline-design
Platform & Pipelines Security | Learn the important security fundamentals for Data Engineering. | https://learndataengineering.com/p/platform-pipeline-security
Choosing Data Stores | Learn the different types of data stores and when to use which. | https://learndataengineering.com/p/choosing-data-stores
Schema Design Data Stores | Learn how to design schemas for SQL, NoSQL and Data Warehouses. | https://learndataengineering.com/p/data-modeling

3. Fundamental Tools
Building APIs with FastAPI | Learn the fundamentals of designing, creating and deploying APIs with FastAPI and Docker. | https://learndataengineering.com/p/apis-with-fastapi-course
Apache Kafka Fundamentals | Learn the fundamentals of Apache Kafka. | https://learndataengineering.com/p/apache-kafka-fundamentals
Apache Spark Fundamentals | Apache Spark quick start course in Python with Jupyter notebooks, DataFrames, SparkSQL and RDDs. | https://learndataengineering.com/p/learning-apache-spark-fundamentals
Data Engineering on Databricks | Everything you need to get started with Databricks, from setup to building ETL pipelines & warehousing. | https://learndataengineering.com/p/data-engineering-on-databricks
MongoDB Fundamentals | Learn how to use MongoDB. | https://learndataengineering.com/p/mongodb-fundamentals-course
Log Analysis with Elasticsearch | Learn how to monitor and debug your data pipelines. | https://learndataengineering.com/p/log-analysis-with-elasticsearch
Airflow Workflow Orchestration | Learn how to orchestrate your data pipelines with Apache Airflow. | https://learndataengineering.com/p/learn-apache-airflow
Snowflake for Data Engineers | Everything you need to get started with Snowflake. | https://learndataengineering.com/p/snowflake-for-data-engineers
dbt for Data Engineers | Everything you need to work with dbt and Snowflake. | https://learndataengineering.com/p/dbt-for-data-engineers

4. Full Hands-On Example Projects
Data Engineering on AWS | Full 5-hour course with a complete example project: building stream and batch processing pipelines on AWS. | https://learndataengineering.com/p/data-engineering-on-aws
Data Engineering on Azure | Ingest, Store, Process, Serve and Visualize Streams of Data by Building Streaming Data Pipelines in Azure. | https://learndataengineering.com/p/build-streaming-data-pipelines-in-azure
Data Engineering on GCP | Everything you need to start with Google Cloud. | https://learndataengineering.com/p/data-engineering-on-gcp
Modern Data Warehouses & Data Lakes | How to integrate a Data Lake with a Data Warehouse and query data directly from files. | https://learndataengineering.com/p/modern-data-warehouses
Machine Learning & Containerization On AWS | Build an app that analyzes the sentiment of tweets and visualizes them on a user interface hosted as a container. | https://learndataengineering.com/p/ml-on-aws
Contact Tracing with Elasticsearch | Track 100,000 users in San Francisco using Elasticsearch and an interactive Streamlit user interface. | https://learndataengineering.com/p/contact-tracing-with-elasticsearch
Document Streaming Project | Document streaming with FastAPI, Kafka, Spark Streaming, MongoDB and Streamlit. | https://learndataengineering.com/p/document-streaming
Storing & Visualizing Time Series Data with InfluxDB and Grafana | Learn how to use InfluxDB to store time series data and visualize interactive dashboards with Grafana. | https://learndataengineering.com/p/time-series-influxdb-grafana
Data Engineering with Hadoop | Hadoop project with HDFS, YARN, MapReduce, Hive and Sqoop! | https://learndataengineering.com/p/data-engineering-with-hadoop
Dockerized ETL | Learn how to quickly set up a simple ETL script with AWS, TDengine & Grafana. | https://learndataengineering.com/p/timeseries-etl-with-aws-tdengine-grafana

    Certifications#

    Here's a list of great certifications you can do on AWS and Azure. We left out GCP because the adoption of AWS and Azure is a lot higher, which is why I recommend starting with one of these. The prices are usually the fees for taking the certification exams. We also added the level and prerequisites to make it easier for you to decide which one fits you.

Platform | Certification Name | Price (USD) | Level | Prerequisite Experience | URL
AWS | AWS Certified Cloud Practitioner (maybe) | 100 | Beginner | Familiarity with the AWS platform is recommended but not required. | Link
AWS | AWS Certified Solutions Architect | 300 | Expert | AWS Certified Solutions Architect - Professional is intended for individuals with two or more years of hands-on experience designing and deploying cloud architecture on AWS. | Link
AWS | AWS Certified Solutions Architect | 150 | Intermediate | This is an ideal starting point for candidates with AWS Cloud or strong on-premises IT experience. This exam does not require deep hands-on coding experience, although familiarity with basic programming concepts would be an advantage. | Link
AWS | AWS Certified Data Engineer | 150 | Intermediate | The ideal candidate for this exam has the equivalent of 2-3 years of experience in data engineering or data architecture and a minimum of 1-2 years of hands-on experience with AWS services. | Link
Azure | Microsoft Certified: Azure Cosmos DB Developer Specialty | 165 | Intermediate | - | Link
Azure | Microsoft Certified: Azure Data Engineer Associate - DP 203 | 165 | Intermediate | - | Link
Azure | Microsoft Certified: Azure Data Fundamentals | 99 | Beginner | - | Link
Azure | Microsoft Certified: Azure Database Administrator Associate | 165 | Intermediate | - | Link
Azure | Microsoft Certified: Azure Developer Associate | 165 | Intermediate | - | Link
Azure | Microsoft Certified: Azure Fundamentals | 99 | Beginner | - | Link
Azure | Microsoft Certified: Azure Solutions Architect Expert | 165 | Expert | Microsoft Certified: Azure Administrator Associate certification | Link
Azure | Microsoft Certified: Fabric Analytics Engineer Associate | 165 | Intermediate | - | Link
Azure | Microsoft Certified: Fabric Data Engineer Associate | 165 | Intermediate | - | Link
Azure | Microsoft Certified: Power BI Data Analyst Associate | 165 | Intermediate | - | Link

    Podcasts#

    Top five podcasts by the number of episodes created.

    Super Data Science#

    The latest machine learning, A.I., and data career topics from across both academia and industry are brought to you by host Dr. Jon Krohn on the Super Data Science Podcast.

    Data Skeptic#

    The Data Skeptic Podcast features interviews and discussion of topics related to data science, statistics, machine learning, artificial intelligence and the like, all from the perspective of applying critical thinking and the scientific method to evaluate the veracity of claims and efficacy of approaches.

    Data Engineering Podcast#

    This show goes behind the scenes for the tools, techniques, and difficulties associated with the discipline of data engineering. Databases, workflows, automation, and data manipulation are just some of the topics that you will find here.

    Roaring Elephant - Bite-Sized Big Tech#

    A weekly community podcast about Big Technology with a focus on Open Source, Advanced Analytics and other modern magic.

    SQL Data Partners Podcast#

    Hosted by Carlos L Chacon, the SQL Data Partners Podcast focuses on Microsoft data platform related topics mixed with a sprinkling of professional development. Carlos and guests discuss new and familiar features and ideas and how you might apply them in your environments.

    Complete list#

Host name | Podcast name | Access podcast
Jon Krohn | Super Data Science | https://www.superdatascience.com/podcast
Kyle Polich | Data Skeptic | https://dataskeptic.com/
Tobias Macey | Data Engineering Podcast | https://www.dataengineeringpodcast.com/
Dave Russell | Roaring Elephant - Bite-Sized Big Tech | https://roaringelephant.org/
Carlos L Chacon | SQL Data Partners Podcast | https://sqldatapartners.com/podcast/
Jason Himmelstein | BIFocal - Clarifying Business Intelligence | https://bifocal.show/
Scott Hirleman | Data Mesh Radio | https://daappod.com/data-mesh-radio/
Jonathan Schwabish | PolicyViz | https://policyviz.com/podcast/
Al Martin | Making Data Simple | https://www.ibm.com/blogs/journey-to-ai/2021/02/making-data-simple-this-week-we-continue-our-discussion-on-data-framework-and-what-is-meant-by-data-framework/
John David Ariansen | How to Get an Analytics Job | https://www.silvertoneanalytics.com/how-to-get-an-analytics-job/
Moritz Stefaner | Data Stories | https://datastori.es/
Hilary Parker | Not So Standard Deviations | https://nssdeviations.com/
Ben Lorica | The Data Exchange with Ben Lorica | https://thedataexchange.media/author/bglorica/
Juan Sequeda | Catalog & Cocktails | https://data.world/resources/podcasts/
Wayne Eckerson | Secrets of Data Analytics Leaders | https://www.eckerson.com/podcasts/secrets-of-data-analytics-leaders
Guy Glantser | SQL Server Radio | https://www.sqlserverradio.com/
Eitan Blumin | SQL Server Radio | https://www.sqlserverradio.com/
Jason Tan | The Analytics Show | https://ddalabs.ai/the-analytics-show/
Hugo Bowne-Anderson | DataFramed | https://www.datacamp.com/podcast
Kostas Pardalis | The Data Stack Show | https://datastackshow.com/
Eric Dodds | The Data Stack Show | https://datastackshow.com/
Catherine King | The Business of Data Podcast | https://podcasts.apple.com/gb/podcast/the-business-of-data-podcast/id1528796448
- | The Business of Data | https://business-of-data.com/podcasts/
James Le | Datacast | https://datacast.simplecast.com/
Mike Delgado | DataTalk | https://podcasts.apple.com/us/podcast/datatalk/id1398548129
Matt Housley | Monday Morning Data Chat | https://podcasts.apple.com/us/podcast/monday-morning-data-chat/id1565154727
Francesco Gadaleta | Data Science at Home | https://datascienceathome.com/
Alli Torban | Data Viz Today | https://dataviztoday.com/
Steve Jones | Voice of the DBA | https://voiceofthedba.com/
Lea Pica | The Present Beyond Measure Show: Data Storytelling, Presentation & Visualization | https://leapica.com/podcast/
Samir Sharma | The Data Strategy Show | https://podcasts.apple.com/us/podcast/the-data-strategy-show/id1515194422
Cindi Howson | The Data Chief | https://www.thoughtspot.com/data-chief/podcast
Cole Nussbaumer Knaflic | storytelling with data podcast | https://storytellingwithdata.libsyn.com/
Margot Gerritsen | Women in Data Science | https://www.widsconference.org/podcast.html
Jonas Christensen | Leaders of Analytics | https://www.leadersofanalytics.com/episode/the-future-of-analytics-leadership-with-john-thompson
Matt Brady | ZUMA: Data For Good | https://www.youtube.com/@zuma-dataforgood
Julia Schottenstein | The Analytics Engineering Podcast | https://roundup.getdbt.com/s/the-analytics-engineering-podcast
- | Data Unlocked | https://dataunlocked.buzzsprout.com/
Boris Jabes | The Sequel Show | https://www.thesequelshow.com/
- | Data Radicals | https://www.alation.com/podcast/
Nicola Askham | The Data Governance | https://www.nicolaaskham.com/podcast
Boaz Farkash | The Data Engineering Show | https://www.dataengineeringshow.com/
Bob Haffner | The Engineering Side of Data | https://podcasts.apple.com/us/podcast/the-engineering-side-of-data/id1566999533
Dan Linstedt | Data Vault Alliance | https://datavaultalliance.com/category/news/podcasts/
Dustin Schimek | Data Ideas | https://podcasts.apple.com/us/podcast/data-ideas/id1650322207
Alex Merced | The datanation | https://podcasts.apple.com/be/podcast/the-datanation-podcast-podcast-for-data-engineers/id1608638822
Thomas Bustos | Let's Talk AI | https://www.youtube.com/@lets-talk-ai
Jahanvee Narang | Decoding Data Analytics | https://www.youtube.com/@decodingdataanalytics/videos

    10-Updates

    Updates#

    What's new? Here you can find a list of all the updates with links to the sections

    • 2024-11-23
  • Prepared a GenAI RAG example project that you can run on your own computer without internet access. It uses Ollama with the Mistral model and Elasticsearch. I'm working on a way of creating embeddings from PDF files and inserting them into Elasticsearch for queries (see the sketch after this list) click here
    • 2024-11-23
      • Added an overview of AWS and Azure cloud certifications for Data Engineers. From beginners to experts click here
    • 2024-07-31
  • Added 10 platform architecture reaction videos I did to the "Best Practices" section. This way you get a better feeling of what companies are doing and which tools they use click here
    • 2024-07-17
  • Added 20 API interview questions and their answers click here
      • Added 10 Python interview questions and their answers click here
    • 2024-07-08
  • Added a large article about Snowflake and dbt for Data Engineers click here
  • Added a new section "Analytical Data Stores" to Advanced Skills with the Snowflake & dbt info.
      • Put SQL and NoSQL datastores into a new section "Transactional Data Stores"
    • 2024-03-20
      • Added roadmap for Software Engineers / Computer Scientists click here
      • Added many questions and answers from my interview on the Super Data Science Podcast (plus links to YouTube and the Podcast) click here
    • 2024-03-13
      • Added "How to become a Senior Data Engineer" live stream series as a blog post with images shown in the live streams and the links to the videos. click here
    • 2024-03-08
  • Included the Data Engineering skills matrix in the introduction with a link to the live stream. click here
    • 2024-03-01
      • Added updates section
      • Reworked the Hands-on courses section with 5 free courses / tutorials from Andreas on YouTube click here
    • 2024-02-28
      • Added Data Engineering Roadmap for Data Scientists: click here
    • 2024-02-25
      • Data Engineering Roadmap for Software Engineers: click here
    • 2024-02-20
      • Data Engineering Roadmap for Data Analysts: click here
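To give you a rough picture of how the GenAI RAG project from the 2024-11-23 update could hang together, here is a minimal sketch, not the actual project code: extract text from a PDF, create embeddings with the Mistral model through a local Ollama server, index them into Elasticsearch, and answer questions from the retrieved chunks. The pypdf, ollama and elasticsearch Python packages, the index name "pdf_chunks", the 4096 embedding dimension and the file name "example.pdf" are all assumptions for illustration.

```python
# Minimal local RAG sketch (illustrative only, not the project's code).
# Assumptions: `pip install pypdf ollama elasticsearch`, `ollama pull mistral`,
# Elasticsearch 8.x running on http://localhost:9200, index name "pdf_chunks",
# and 4096 as the embedding size of the Mistral model served by Ollama.

from pypdf import PdfReader
import ollama
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
INDEX = "pdf_chunks"  # assumed index name

def create_index() -> None:
    # dense_vector mapping so Elasticsearch can run kNN similarity search
    if not es.indices.exists(index=INDEX):
        es.indices.create(
            index=INDEX,
            mappings={
                "properties": {
                    "text": {"type": "text"},
                    "embedding": {
                        "type": "dense_vector",
                        "dims": 4096,  # must match the model's embedding size
                        "index": True,
                        "similarity": "cosine",
                    },
                }
            },
        )

def embed(text: str) -> list[float]:
    # Embeddings come from the local Ollama server, so no internet is needed.
    return ollama.embeddings(model="mistral", prompt=text)["embedding"]

def index_pdf(path: str) -> None:
    # One document per PDF page; a real project would chunk pages further.
    reader = PdfReader(path)
    for page in reader.pages:
        chunk = (page.extract_text() or "").strip()
        if chunk:
            es.index(index=INDEX, document={"text": chunk, "embedding": embed(chunk)})

def ask(question: str) -> str:
    # Retrieve the three most similar chunks and hand them to Mistral as context.
    hits = es.search(
        index=INDEX,
        knn={"field": "embedding", "query_vector": embed(question),
             "k": 3, "num_candidates": 50},
    )
    context = "\n\n".join(h["_source"]["text"] for h in hits["hits"]["hits"])
    reply = ollama.chat(model="mistral", messages=[{
        "role": "user",
        "content": f"Answer the question using only this context:\n{context}\n\nQuestion: {question}",
    }])
    return reply["message"]["content"]

if __name__ == "__main__":
    create_index()
    index_pdf("example.pdf")  # hypothetical file name
    print(ask("What is this document about?"))
```

The point of the setup is that everything runs locally: Ollama serves the Mistral model for both the embeddings and the answers, and Elasticsearch handles the vector retrieval, so no internet connection is needed once the model has been pulled.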

    THE DATA ENGINEERING COOKBOOK

    by ANDREAS KRETZ
