From a201065e627c838008589d0f11ed0330a4ec0d10 Mon Sep 17 00:00:00 2001 From: rlh1994 <8260415+rlh1994@users.noreply.github.com> Date: Tue, 25 Jun 2024 11:34:42 +0000 Subject: [PATCH 1/3] [create-pull-request] automated change --- src/componentVersions.js | 2 +- .../Schemas/dbtUnified_0.4.4.json | 940 ++++++++++++++++++ 2 files changed, 941 insertions(+), 1 deletion(-) create mode 100644 src/components/JsonSchemaValidator/Schemas/dbtUnified_0.4.4.json diff --git a/src/componentVersions.js b/src/componentVersions.js index 12fa59c5fa..4cd922433e 100644 --- a/src/componentVersions.js +++ b/src/componentVersions.js @@ -42,7 +42,7 @@ export const versions = { // Data Modelling // dbt dbtSnowplowAttribution: '0.2.2', - dbtSnowplowUnified: '0.4.3', + dbtSnowplowUnified: '0.4.4', dbtSnowplowWeb: '1.0.1', dbtSnowplowMobile: '1.0.0', dbtSnowplowUtils: '0.16.7', diff --git a/src/components/JsonSchemaValidator/Schemas/dbtUnified_0.4.4.json b/src/components/JsonSchemaValidator/Schemas/dbtUnified_0.4.4.json new file mode 100644 index 0000000000..f53ecd3a09 --- /dev/null +++ b/src/components/JsonSchemaValidator/Schemas/dbtUnified_0.4.4.json @@ -0,0 +1,940 @@ +{ + "definitions": { + "passthrough_vars": { + "type": "array", + "description": "> Click the plus sign to add a new entry", + "minItems": 0, + "items": { + "title": "Type", + "oneOf": [ + { + "type": "string", + "title": "Column Name" + }, + { + "type": "object", + "title": "SQL & Alias", + "properties": { + "sql": { + "type": "string", + "title": "Field SQL", + "format": "sql", + "order": 1 + }, + "alias": { + "type": "string", + "title": "Field Alias", + "order": 2 + } + }, + "required": [ + "sql", + "alias" + ], + "additionalProperties": false + } + ] + }, + "uniqueItems": true + }, + "aggregation_vars": { + "type": "array", + "description": "> Click the plus sign to add a new entry", + "minItems": 0, + "items": { + "type": "object", + "properties": { + "type": { + "enum": [ + "sum", + "avg", + "min", + "max", + "count", 
+ "countd" + ], + "title": "Aggregation type", + "order": 1 + }, + "field": { + "type": "string", + "title": "Field to aggregate", + "order": 2 + }, + "alias": { + "type": "string", + "title": "Aggregated field alias", + "order": 3 + } + }, + "required": [ + "type", + "field", + "alias" + ], + "additionalProperties": false + }, + "uniqueItems": true + } + }, + "type": "object", + "properties": { + "snowplow__atomic_schema": { + "recommendFullRefresh": true, + "order": 3, + "consoleGroup": "required", + "type": "string", + "title": "Schema", + "description": "Schema (dataset) that contains your atomic events", + "longDescription": "The schema (dataset for BigQuery) that contains your atomic events table.", + "packageDefault": "atomic", + "group": "Warehouse and Tracker" + }, + "snowplow__database": { + "recommendFullRefresh": true, + "order": 1, + "consoleGroup": "required", + "type": "string", + "title": "Database", + "description": "Database that contains your atomic events", + "longDescription": "The database that contains your atomic events table.", + "packageDefault": "target.database", + "group": "Warehouse and Tracker" + }, + "snowplow__dev_target_name": { + "recommendFullRefresh": false, + "order": 87, + "consoleGroup": "advanced", + "type": "string", + "title": "Dev Target", + "description": "Target name of your development environment as defined in your `profiles.yml` file", + "longDescription": "The [target name](https://docs.getdbt.com/docs/core/connect-data-platform/profiles.yml) of your development environment as defined in your `profiles.yml` file. 
See the [Manifest Tables](/docs/modeling-your-data/modeling-your-data-with-dbt/package-mechanics/manifest-tables/) section for more details.", + "packageDefault": "dev", + "group": "Warehouse and Tracker" + }, + "snowplow__events_table": { + "recommendFullRefresh": true, + "order": 5, + "consoleGroup": "required", + "type": "string", + "title": "Events Table", + "description": "The name of the table that contains your atomic events", + "longDescription": "The name of the table that contains your atomic events.", + "packageDefault": "events", + "group": "Warehouse and Tracker" + }, + "snowplow__heartbeat": { + "recommendFullRefresh": true, + "order": 14, + "consoleGroup": "basic", + "type": "number", + "minimum": 0, + "title": "Heartbeat", + "description": "Page ping heartbeat time as defined in your tracker configuration", + "longDescription": "Page ping heartbeat time as defined in your [tracker configuration](/docs/collecting-data/collecting-from-own-applications/javascript-trackers/web-tracker/tracking-events/#activity-tracking-page-pings).", + "packageDefault": "10", + "group": "Warehouse and Tracker" + }, + "snowplow__min_visit_length": { + "recommendFullRefresh": true, + "order": 15, + "consoleGroup": "basic", + "type": "number", + "minimum": 0, + "title": "Min Visit length", + "description": "Minimum visit length as defined in your tracker configuration", + "longDescription": "Minimum visit length as defined in your [tracker configuration](/docs/collecting-data/collecting-from-own-applications/javascript-trackers/web-tracker/tracking-events/#activity-tracking-page-pings).", + "packageDefault": "5", + "group": "Warehouse and Tracker" + }, + "snowplow__sessions_table": { + "recommendFullRefresh": true, + "order": 123, + "consoleGroup": "advanced", + "type": "string", + "title": "Sessions Table", + "description": "The users module requires data from the derived sessions table. 
If you choose to disable the standard sessions table in favor of your own custom table, set this to reference your new table e.g. {{ ref(\"snowplow_unified_sessions_custom\") }}", + "group": "Warehouse and Tracker", + "longDescription": "The users module requires data from the derived sessions table. If you choose to disable the standard sessions table in favor of your own custom table, set this to reference your new table e.g. `{{ ref(\\'snowplow_unified_sessions_custom\\') }}`. Please see the [README](https://github.com/snowplow/dbt-snowplow-unified/tree/main/custom_example) in the `custom_example` directory for more information on this sort of implementation.", + "packageDefault": "\"{{ ref( \\'snowplow_unified_sessions\\' ) }}\"" + }, + "snowplow__allow_refresh": { + "recommendFullRefresh": true, + "order": 39, + "consoleGroup": "advanced", + "type": "boolean", + "title": "Allow Refresh", + "group": "Operation and Logic", + "longDescription": "Used as the default value to return from the `allow_refresh()` macro. This macro determines whether the manifest tables can be refreshed or not, depending on your environment. See the [Manifest Tables](/docs/modeling-your-data/modeling-your-data-with-dbt/package-mechanics/manifest-tables/) section for more details.", + "packageDefault": "false" + }, + "snowplow__backfill_limit_days": { + "recommendFullRefresh": false, + "order": 41, + "consoleGroup": "advanced", + "type": "number", + "minimum": 0, + "title": "Backfill Limit", + "group": "Operation and Logic", + "longDescription": "The maximum numbers of days of new data to be processed since the latest event processed. 
Please refer to the [incremental logic](/docs/modeling-your-data/modeling-your-data-with-dbt/package-mechanics/incremental-processing/#package-state) section for more details.", + "packageDefault": "30", + "description": "The maximum numbers of days of new data to be processed since the latest event processed" + }, + "snowplow__conversion_events": { + "recommendFullRefresh": false, + "order": 45, + "consoleGroup": "advanced", + "title": "Conversion Definition", + "group": "Operation and Logic", + "description": "> Click the plus sign to add a new entry", + "longDescription": "A list of dictionaries that define a conversion event for your modeling, to add the relevant columns to the sessions table. The dictionary keys are `name` (required), `condition` (required), `value`, `default_value`, and `list_events`. For more information see the [package documentation](/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-models/dbt-unified-data-model/conversions/).", + "packageDefault": "", + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "required": [ + "name", + "condition" + ], + "title": "", + "description": "Conversion Event", + "properties": { + "name": { + "type": "string", + "title": "Name", + "description": "Name of your conversion type", + "order": 1 + }, + "condition": { + "type": "string", + "title": "Condition", + "description": "SQL condition e.g. event_name = 'page_view'", + "format": "sql", + "order": 2 + }, + "value": { + "type": "string", + "title": "Value", + "description": "SQL value e.g. tr_total_base", + "format": "sql", + "order": 3 + }, + "default_value": { + "type": "number", + "title": "Default value", + "description": "Default value e.g. 
0", + "order": 4 + }, + "list_events": { + "type": "boolean", + "title": "List all event ids?", + "order": 5 + } + } + }, + "uniqueItems": true + }, + "snowplow__cwv_days_to_measure": { + "recommendFullRefresh": true, + "order": 85, + "consoleGroup": "advanced", + "type": "number", + "minimum": 1, + "title": "CWV Days To Measure", + "group": "Operation and Logic", + "longDescription": "The number of days to use for web vital measurements (if enabled).", + "packageDefault": "28", + "description": "The number of days to use for web vital measurements (if enabled)" + }, + "snowplow__cwv_percentile": { + "recommendFullRefresh": true, + "order": 86, + "consoleGroup": "advanced", + "type": "number", + "minimum": 0, + "maximum": 100, + "title": "CWV Percentile", + "group": "Operation and Logic", + "longDescription": "The percentile that the web vitals measurements that are produced for all page views (if enabled).", + "packageDefault": "75", + "description": "The percentile that the web vitals measurements that are produced for all page views (if enabled)" + }, + "snowplow__days_late_allowed": { + "recommendFullRefresh": true, + "order": 42, + "consoleGroup": "advanced", + "type": "number", + "minimum": 0, + "title": "Days Late Allowed", + "group": "Operation and Logic", + "longDescription": "The maximum allowed number of days between the event creation and it being sent to the collector. 
Exists to reduce lengthy table scans that can occur as a result of late arriving data.", + "packageDefault": "3", + "description": "The maximum allowed number of days between the event creation and it being sent to the collector" + }, + "snowplow__lookback_window_hours": { + "recommendFullRefresh": false, + "order": 43, + "consoleGroup": "advanced", + "type": "number", + "minimum": 0, + "title": "Event Lookback Window", + "longDescription": "The number of hours to look before the latest event processed - to account for late arriving data, which comes out of order.", + "packageDefault": "6", + "group": "Operation and Logic", + "description": "The number of hours to look before the latest event processed - to account for late arriving data, which comes out of order" + }, + "snowplow__max_session_days": { + "recommendFullRefresh": true, + "order": 110, + "consoleGroup": "advanced", + "type": "number", + "minimum": 0, + "title": "Max Session Length", + "longDescription": "The maximum allowed session length in days. For a session exceeding this length, all events after this limit will stop being processed. Exists to reduce lengthy table scans that can occur due to long sessions which are usually a result of bots.", + "packageDefault": "3", + "group": "Operation and Logic", + "description": "The maximum allowed session length in days. For a session exceeding this length, all events after this limit will stop being processed" + }, + "snowplow__session_lookback_days": { + "recommendFullRefresh": false, + "order": 121, + "consoleGroup": "advanced", + "type": "number", + "minimum": 0, + "title": "Session Lookback Window", + "longDescription": "Number of days to limit scan on `snowplow_unified_base_sessions_lifecycle_manifest` manifest. Exists to improve performance of model when we have a lot of sessions. 
Should be set to as large a number as practical.", + "packageDefault": "730", + "group": "Operation and Logic", + "description": "Number of days to limit scan on `snowplow_unified_base_sessions_lifecycle_manifest` manifest" + }, + "snowplow__session_stitching": { + "recommendFullRefresh": false, + "order": 51, + "consoleGroup": "advanced", + "type": "boolean", + "title": "Enable Session Stitching", + "longDescription": "Determines whether to apply the user mapping to the sessions table. Please see the [User Mapping](/docs/modeling-your-data/modeling-your-data-with-dbt/package-features/identity-stitching/) section for more details.", + "packageDefault": "true", + "group": "Operation and Logic" + }, + "snowplow__start_date": { + "recommendFullRefresh": false, + "order": 6, + "consoleGroup": "required", + "type": "string", + "format": "date", + "title": "Start Date", + "group": "Operation and Logic", + "longDescription": "The date to start processing events from in the package on first run or a full refresh, based on `collector_tstamp`", + "packageDefault": "2020-01-01", + "description": "The date to start processing events from in the package on first run or a full refresh, based on `collector_tstamp`" + }, + "snowplow__total_all_conversions": { + "recommendFullRefresh": false, + "order": 44, + "consoleGroup": "advanced", + "type": "boolean", + "title": "Total All Conversions", + "longDescription": "A boolean flag whether to calculate and add the `cv__all_volume` and `cv__all_total` columns. 
For more information see the [package documentation](/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-models/dbt-unified-data-model/conversions/).", + "packageDefault": "false", + "group": "Operation and Logic" + }, + "snowplow__upsert_lookback_days": { + "recommendFullRefresh": false, + "order": 126, + "consoleGroup": "advanced", + "type": "number", + "minimum": 0, + "title": "Upsert Lookback Days", + "group": "Operation and Logic", + "longDescription": "Number of days to look back over the incremental derived tables during the upsert. Where performance is not a concern, should be set to as long a value as possible. Having too short a period can result in duplicates. Please see the [Snowplow Optimized Materialization](/docs/modeling-your-data/modeling-your-data-with-dbt/package-mechanics/optimized-upserts/) section for more details.", + "packageDefault": "30", + "description": "Number of days to look back over the incremental derived tables during the upsert" + }, + "snowplow__app_id": { + "recommendFullRefresh": false, + "order": 8, + "consoleGroup": "basic", + "type": "array", + "description": "> Click the plus sign to add a new entry", + "minItems": 0, + "title": "App IDs", + "longDescription": "A list of `app_id`s to filter the events table on for processing within the package.", + "packageDefault": "[ ] (no filter applied)", + "group": "Contexts, Filters, and Logs", + "items": { + "type": "string", + "title": "App ID" + } + }, + "snowplow__enable_consent": { + "recommendFullRefresh": false, + "order": 97, + "consoleGroup": "advanced", + "type": "boolean", + "group": "Contexts, Filters, and Logs", + "longDescription": "Flag to enable the [consent](/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-models/dbt-unified-data-model/consent-module/) module.", + "packageDefault": "false", + "title": "Enable Consent Module" + }, + "snowplow__enable_cwv": { + "recommendFullRefresh": false, + "order": 98, + "consoleGroup": "advanced", + "type": "boolean", 
+ "group": "Contexts, Filters, and Logs", + "longDescription": "Flag to enable the [Core Web Vitals](/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-models/dbt-unified-data-model/core-web-vitals-module/) module.", + "packageDefault": "false", + "title": "Enable Core Web Vitals Module" + }, + "snowplow__enable_iab": { + "recommendFullRefresh": false, + "order": 25, + "consoleGroup": "basic", + "type": "boolean", + "group": "Contexts, Filters, and Logs", + "longDescription": "Flag to include the [IAB enrichment](/docs/enriching-your-data/available-enrichments/iab-enrichment/) data in the models.", + "packageDefault": "false", + "title": "Enable IAB" + }, + "snowplow__enable_ua": { + "recommendFullRefresh": false, + "order": 27, + "consoleGroup": "basic", + "type": "boolean", + "group": "Contexts, Filters, and Logs", + "longDescription": "Flag to include the [UA Parser enrichment](/docs/enriching-your-data/available-enrichments/ua-parser-enrichment/) data in the models.", + "packageDefault": "false", + "title": "Enable UA" + }, + "snowplow__enable_yauaa": { + "recommendFullRefresh": false, + "order": 26, + "consoleGroup": "basic", + "type": "boolean", + "group": "Contexts, Filters, and Logs", + "longDescription": "Flag to include the [YAUAA enrichment](/docs/enriching-your-data/available-enrichments/yauaa-enrichment/) data in the models.", + "packageDefault": "false", + "title": "Enable YAUAA" + }, + "snowplow__has_log_enabled": { + "recommendFullRefresh": false, + "order": 107, + "consoleGroup": "advanced", + "type": "boolean", + "group": "Contexts, Filters, and Logs", + "longDescription": "When executed, the package logs information about the current run to the CLI. 
This can be disabled by setting to `false`.", + "packageDefault": "true", + "title": "Enable Run Logs" + }, + "snowplow__ua_bot_filter": { + "recommendFullRefresh": true, + "order": 21, + "consoleGroup": "basic", + "type": "boolean", + "group": "Contexts, Filters, and Logs", + "longDescription": "Flag to filter out bots via the `useragent` string pattern match.", + "packageDefault": "true", + "title": "Filter Bots" + }, + "snowplow__databricks_catalog": { + "recommendFullRefresh": true, + "order": 2, + "consoleGroup": "required", + "type": "string", + "title": "(Databricks) Catalog", + "warehouse": "Databricks", + "group": "Warehouse Specific", + "longDescription": "The catalogue your atomic events table is in. Depending on the use case it should either be the catalog (for Unity Catalog users from databricks connector 1.1.1 onwards, defaulted to `hive_metastore`) or the same value as your `snowplow__atomic_schema` (unless changed it should be 'atomic').", + "packageDefault": "hive_metastore", + "description": "The catalogue your atomic events table is in" + }, + "snowplow__enable_load_tstamp": { + "recommendFullRefresh": false, + "order": 99, + "consoleGroup": "advanced", + "type": "boolean", + "warehouse": "Redshift", + "title": "(Redshift) Enable load_tstamp", + "longDescription": "Flag to include the `load_tstamp` column in the base events this run model. This should be set to true (the default) unless you are using the Postgres loader or an RDB loader version less than 4.0.0. 
It must be true to use consent models on Postgres and Redshift.", + "packageDefault": "true", + "group": "Warehouse Specific" + }, + "snowplow__derived_tstamp_partitioned": { + "recommendFullRefresh": false, + "order": 9, + "consoleGroup": "basic", + "type": "boolean", + "warehouse": "Bigquery", + "title": "(Bigquery) Derived Timestamp Partition", + "longDescription": "Boolean to enable filtering the events table on `derived_tstamp` in addition to `collector_tstamp`.", + "packageDefault": "true", + "group": "Warehouse Specific" + }, + "snowplow__ga4_categories_seed": { + "recommendFullRefresh": false, + "order": 58, + "consoleGroup": "advanced", + "type": "string", + "title": "Seed reference for GA4 Categories", + "longDescription": "Name of the model for the GA4 category mapping seed table, either a seed or a model (if you want to use a source, create a model to select from it).", + "packageDefault": "snowplow_unified_dim_ga4_source_categories", + "group": "Warehouse and Tracker" + }, + "snowplow__geo_mapping_seed": { + "recommendFullRefresh": false, + "order": 59, + "consoleGroup": "advanced", + "type": "string", + "title": "Seed reference for geo mapping", + "longDescription": "Name of the model for the Geo mapping seed table, either a seed or a model (if you want to use a source, create a model to select from it).", + "packageDefault": "snowplow_unified_dim_geo_country_mapping", + "group": "Warehouse and Tracker" + }, + "snowplow__rfc_5646_seed": { + "recommendFullRefresh": false, + "order": 57, + "consoleGroup": "advanced", + "type": "string", + "title": "Seed reference for rfc 5646 (language mapping)", + "longDescription": "Name of the model for the RFC 5646 (language) mapping seed table, either a seed or a model (if you want to use a source, create a model to select from it).", + "packageDefault": "snowplow_unified_dim_rfc_5646_language_mapping", + "group": "Warehouse and Tracker" + }, + "snowplow__session_identifiers": { + "recommendFullRefresh": true, + 
"order": 46, + "consoleGroup": "advanced", + "title": "Session Identifiers", + "group": "Operation and Logic", + "longDescription": "A list of key:value dictionaries which contain all of the contexts and fields where your session identifiers are located. For each entry in the list, if your map contains the `schema` value `atomic`, then this refers to a field found directly in the atomic `events` table. If you are trying to introduce a context/entity with an identifier in it, the package will look for the context in your events table with the name specified in the `schema` field. It will use the specified value in the `field` key as the field name to access. For Redshift/Postgres, using the `schema` key the package will try to find a table in your `snowplow__events_schema` schema with the same name as the `schema` value provided, and join that. If multiple fields are specified, the package will try to coalesce all fields in the order specified in the list. For a better understanding of the advanced usage of this variable, please see the [Custom Identifiers](/docs/modeling-your-data/modeling-your-data-with-dbt/package-features/custom-identifiers/) section for more details.", + "packageDefault": "[{\"schema\" : \"atomic\", \"field\" : \"domain_sessionid\"}]", + "type": "array", + "description": "> Click the plus sign to add a new entry", + "minItems": 0, + "items": { + "type": "object", + "title": "Identifier", + "properties": { + "schema": { + "type": "string", + "title": "(JSON) schema name for the field", + "order": 1, + "description": "The schema name of your events table, atomic in most use cases, alternatively for sdes/contexts this should instead be the name of the field itself" + }, + "field": { + "type": "string", + "order": 2, + "title": "Field name", + "description": "The name of the field to use as session identifier, alternatively, in case of sdes/contexts it is the name of the element that refers to the field to be extracted" + } + }, + "required": [ + 
"schema", + "field" + ], + "additionalProperties": false + }, + "uniqueItems": true + }, + "snowplow__session_sql": { + "recommendFullRefresh": true, + "order": 47, + "consoleGroup": "advanced", + "type": "string", + "format": "sql", + "title": "SQL for your session identifier", + "longDescription": "This allows you to override the `session_identifiers` SQL, to define completely custom SQL in order to build out a session identifier for your events. If you are interested in using this instead of providing identifiers through the `session_identifiers` variable, please see the [Custom Identifiers](/docs/modeling-your-data/modeling-your-data-with-dbt/package-features/custom-identifiers/) section for more details on how to do that.", + "packageDefault": "", + "group": "Operation and Logic" + }, + "snowplow__session_timestamp": { + "recommendFullRefresh": false, + "order": 55, + "consoleGroup": "advanced", + "type": "string", + "title": "Timestamp used for incremental processing, should be your partition field", + "group": "Operation and Logic", + "longDescription": "Determines which timestamp is used to build the sessionization logic. It's a good idea to have this timestamp be the same timestamp as the field you partition your events table on.", + "packageDefault": "collector_tstamp" + }, + "snowplow__user_identifiers": { + "recommendFullRefresh": true, + "order": 48, + "consoleGroup": "advanced", + "title": "User Identifiers", + "group": "Operation and Logic", + "longDescription": "A list of key:value dictionaries which contain all of the contexts and fields where your user identifiers are located. For each entry in the list, if your map contains the `schema` value `atomic`, then this refers to a field found directly in the atomic `events` table. If you are trying to introduce a context/entity with an identifier in it, the package will look for the context in your events table with the name specified in the `schema` field. 
It will use the specified value in the `field` key as the field name to access. For Redshift/Postgres, using the `schema` key the package will try to find a table in your `snowplow__events_schema` schema with the same name as the `schema` value provided, and join that. If multiple fields are specified, the package will try to coalesce all fields in the order specified in the list. For a better understanding of the advanced usage of this variable, please see the [Custom Identifiers](/docs/modeling-your-data/modeling-your-data-with-dbt/package-features/custom-identifiers/) section for more details.", + "packageDefault": "[{\"schema\" : \"atomic\", \"field\" : \"domain_userid\"}]", + "type": "array", + "description": "> Click the plus sign to add a new entry", + "minItems": 0, + "items": { + "type": "object", + "title": "Identifier", + "properties": { + "schema": { + "type": "string", + "title": "(JSON) schema name for the field", + "order": 1, + "description": "The schema name of your events table, atomic in most use cases, alternatively for sdes/contexts this should instead be the name of the field itself" + }, + "field": { + "type": "string", + "title": "Field name", + "order": 2, + "description": "The name of the field to use as user identifier, alternatively, in case of sdes/contexts it is the name of the element that refers to the field to be extracted" + } + }, + "required": [ + "schema", + "field" + ], + "additionalProperties": false + }, + "uniqueItems": true + }, + "snowplow__user_sql": { + "recommendFullRefresh": true, + "order": 49, + "consoleGroup": "advanced", + "type": "string", + "format": "sql", + "title": "SQL for your user identifier", + "longDescription": "This allows you to override the `user_identifiers` SQL, to define completely custom SQL in order to build out a user identifier for your events. 
If you are interested in using this instead of providing identifiers through the `user_identifiers` variable, please see the [Custom Identifiers](/docs/modeling-your-data/modeling-your-data-with-dbt/package-features/custom-identifiers/) section for more details on how to do that.", + "packageDefault": "", + "group": "Operation and Logic" + }, + "snowplow__user_stitching_id": { + "recommendFullRefresh": false, + "order": 50, + "consoleGroup": "advanced", + "type": "string", + "format": "sql", + "title": "Field used when stitching together users", + "longDescription": "This is the user_id you want to stitch to sessions (and/or page views) with matching domain_userids. It supports raw `sql` expressions.", + "packageDefault": "user_id", + "group": "Operation and Logic" + }, + "snowplow__view_passthroughs": { + "recommendFullRefresh": false, + "order": 132, + "consoleGroup": "advanced", + "title": "Page View Passthroughs", + "group": "Contexts, Filters, and Logs", + "longDescription": "Field(s) to carry through from the events table to the derived table. The field is from the `page/screen_view` event record. Aggregation is not supported. A list of either flat column names from the events table or a dictionary with the keys `sql` for the SQL code to select the column and `alias` for the alias of the column in the output. See the [Passthrough field](/docs/modeling-your-data/modeling-your-data-with-dbt/package-features/passthrough-fields/) docs for more information.", + "packageDefault": "[ ] (no passthroughs)", + "$ref": "#/definitions/passthrough_vars" + }, + "snowplow__session_passthroughs": { + "recommendFullRefresh": false, + "order": 122, + "consoleGroup": "advanced", + "title": "Session Passthroughs", + "group": "Contexts, Filters, and Logs", + "longDescription": "Field(s) to carry through from the events table to the derived table. The field is based on the first `page/screen_view` or `page_ping` event for that session. Aggregation is not supported. 
A list of either flat column names from the events table or a dictionary with the keys `sql` for the SQL code to select the column and `alias` for the alias of the column in the output.", + "packageDefault": "[ ] (no passthroughs)", + "$ref": "#/definitions/passthrough_vars" + }, + "snowplow__user_first_passthroughs": { + "recommendFullRefresh": false, + "order": 129, + "consoleGroup": "advanced", + "title": "User First Passthroughs", + "group": "Contexts, Filters, and Logs", + "longDescription": "Field(s) to carry through from the events table to the derived table. The field is based on the first session record for that user. Aggregation is not supported. A list of either flat column names from the sessions table or a dictionary with the keys `sql` for the SQL code to select the column and `alias` for the alias of the column in the output.", + "packageDefault": "[ ] (no passthroughs)", + "$ref": "#/definitions/passthrough_vars" + }, + "snowplow__user_last_passthroughs": { + "recommendFullRefresh": false, + "order": 130, + "consoleGroup": "advanced", + "title": "User Last Passthroughs", + "group": "Contexts, Filters, and Logs", + "longDescription": "Field(s) to carry through from the events table to the derived table. The field is based on the last session record for that user. Aggregation is not supported. A list of either flat column names from the sessions table or a dictionary with the keys `sql` for the SQL code to select the column and `alias` for the alias of the column in the output. See the [Passthrough field](/docs/modeling-your-data/modeling-your-data-with-dbt/package-features/passthrough-fields/) docs for more information. 
Note flat fields will be aliased with a `last_` prefix, dictionary provided aliases will not by default.", + "packageDefault": "[ ] (no passthroughs)", + "$ref": "#/definitions/passthrough_vars" + }, + "snowplow__conversion_passthroughs": { + "recommendFullRefresh": false, + "order": 75, + "consoleGroup": "advanced", + "title": "User Conversion Passthroughs", + "group": "Contexts, Filters, and Logs", + "longDescription": "Field(s) to carry through from the events table to the derived table. The field is based on the events_this_run table therefore taking all events. Aggregation is not supported. A list of either flat column names from the events table or a dictionary with the keys `sql` for the SQL code to select the column and `alias` for the alias of the column in the output. See the [Passthrough field](/docs/modeling-your-data/modeling-your-data-with-dbt/package-features/passthrough-fields/) docs for more information.Note flat fields will be aliased with a `last_` prefix, dictionary provided aliases will not by default.", + "packageDefault": "[ ] (no passthroughs)", + "$ref": "#/definitions/passthrough_vars" + }, + "snowplow__entities_or_sdes": { + "recommendFullRefresh": false, + "order": 104, + "consoleGroup": "advanced", + "title": "(Redshift) Entities or SDEs", + "longDescription": "A list of dictionaries defining the `entity` or `self-describing` event tables to join onto your base events table. 
Please use the tool below or see the section on [Utilizing custom contexts or SDEs](/docs/modeling-your-data/modeling-your-data-with-dbt/package-features/modeling-entities/) for details of the structure.", + "packageDefault": "[]", + "warehouse": "Redshift", + "group": "Warehouse Specific", + "type": "array", + "description": "> Click the plus sign to add a new entry", + "minItems": 0, + "items": { + "type": "object", + "title": "Entity or SDE", + "properties": { + "schema": { + "type": "string", + "title": "Table name", + "description": "Table name", + "order": 1 + }, + "prefix": { + "type": "string", + "title": "Column prefix", + "description": "Prefix to add to columns", + "order": 2 + }, + "alias": { + "type": "string", + "title": "CTE Alias", + "description": "Table alias for the subquery", + "order": 3 + }, + "single_entity": { + "type": "boolean", + "title": "Is single entity?", + "order": 4 + } + }, + "required": [ + "schema", + "prefix" + ], + "additionalProperties": false + }, + "uniqueItems": true + }, + "snowplow__view_stitching": { + "recommendFullRefresh": false, + "order": 53, + "consoleGroup": "advanced", + "type": "boolean", + "title": "Enable View Stitching", + "longDescription": "Determines whether to apply the user mapping to the views table. Note this can be an expensive operation to do every run. One way to mitigate this is by running this update with less frequency than your usual run by enabling this variable only for that specific run. Please see the [User Mapping](/docs/modeling-your-data/modeling-your-data-with-dbt/package-features/identity-stitching/) section for more details.", + "packageDefault": "false", + "group": "Operation and Logic" + }, + "snowplow__conversion_stitching": { + "recommendFullRefresh": true, + "order": 77, + "consoleGroup": "advanced", + "type": "boolean", + "title": "Enable Conversion Stitching", + "longDescription": "Determines whether to apply the user mapping to the conversions table. 
Please see the [User Mapping](/docs/modeling-your-data/modeling-your-data-with-dbt/package-features/identity-stitching/) section for more details.", + "packageDefault": "true", + "group": "Operation and Logic" + }, + "snowplow__list_event_counts": { + "recommendFullRefresh": false, + "order": 16, + "consoleGroup": "basic", + "type": "boolean", + "title": "List Per-Event Counts", + "longDescription": "A boolean whether to include a json-type (varies by warehouse) column in the sessions table with a count of events for each `event_type` in that session.", + "packageDefault": "false", + "group": "Operation and Logic" + }, + "snowplow__enable_mobile_context": { + "recommendFullRefresh": false, + "order": 100, + "consoleGroup": "advanced", + "type": "boolean", + "title": "Enable Mobile Context", + "longDescription": "Flag to include mobile context data in the models", + "packageDefault": "false", + "group": "Contexts, Filters, and Logs" + }, + "snowplow__enable_geolocation_context": { + "recommendFullRefresh": false, + "order": 24, + "consoleGroup": "basic", + "type": "boolean", + "title": "Enable Geolocation Context", + "longDescription": "Flag to include the geolocation data in the models.", + "packageDefault": "false", + "group": "Contexts, Filters, and Logs" + }, + "snowplow__enable_application_context": { + "recommendFullRefresh": false, + "order": 96, + "consoleGroup": "advanced", + "type": "boolean", + "title": "Enable App Context", + "longDescription": "Flag to include the app context data in the models.", + "packageDefault": "false", + "group": "Contexts, Filters, and Logs" + }, + "snowplow__enable_screen_context": { + "recommendFullRefresh": false, + "order": 102, + "consoleGroup": "advanced", + "type": "boolean", + "title": "Enable Screen Context", + "longDescription": "Flag to include the mobile screen data in the models.", + "packageDefault": "false", + "group": "Contexts, Filters, and Logs" + }, + "snowplow__enable_app_errors": { + "recommendFullRefresh": 
false, + "order": 94, + "consoleGroup": "advanced", + "type": "boolean", + "title": "Enable App Error Context", + "longDescription": "Flag to include the mobile app error data in the models.", + "packageDefault": "false", + "group": "Contexts, Filters, and Logs" + }, + "snowplow__enable_conversions": { + "recommendFullRefresh": false, + "order": 22, + "consoleGroup": "basic", + "type": "boolean", + "title": "Enable Conversions", + "longDescription": "Flag to enable the conversions optional module.", + "packageDefault": "false", + "group": "Contexts, Filters, and Logs" + }, + "snowplow__enable_deep_link_context": { + "recommendFullRefresh": false, + "order": 23, + "consoleGroup": "basic", + "type": "boolean", + "title": "Enable Deep Link Context", + "longDescription": "Flag to include the deep link context data in the models.", + "packageDefault": "false", + "group": "Contexts, Filters, and Logs" + }, + "snowplow__enable_mobile": { + "recommendFullRefresh": false, + "order": 12, + "consoleGroup": "required", + "type": "boolean", + "title": "Enable Mobile Data", + "longDescription": "Flag to process mobile events throughout the package.", + "packageDefault": "true", + "group": "Warehouse and Tracker" + }, + "snowplow__enable_web": { + "recommendFullRefresh": false, + "order": 10, + "consoleGroup": "required", + "type": "boolean", + "title": "Enable Web Data", + "longDescription": "Flag to process web events throughout the package.", + "packageDefault": "true", + "group": "Warehouse and Tracker" + }, + "snowplow__enable_screen_summary_context": { + "recommendFullRefresh": false, + "order": 103, + "consoleGroup": "advanced", + "type": "boolean", + "title": "Enable Screen Summary Context", + "longDescription": "Flag to process the screen engagement information in the screen summary context on mobile events.", + "packageDefault": "false", + "group": "Contexts, Filters, and Logs" + }, + "snowplow__custom_sql": { + "recommendFullRefresh": false, + "order": 84, + 
"consoleGroup": "advanced", + "type": "string", + "title": "Custom SQL", + "format": "sql", + "group": "Operation and Logic", + "longDescription": "This allows you to introduce custom sql to the `snowplow_unified_events_this_run` table, which you can then leverage in downstream models. For more information on the usage, see the following page on the [advanced usage of the utils package](/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-custom-models/examples/additional-sql-on-events-this-run/).", + "packageDefault": "", + "description": "Custom SQL for your events this run table." + }, + "snowplow__view_aggregations": { + "recommendFullRefresh": false, + "order": 131, + "consoleGroup": "advanced", + "title": "View Aggregations", + "group": "Contexts, Filters, and Logs", + "longDescription": "Aggregations to calculate as part of the derived table. A list of dictionaries defining the type (sum, avg, min, max, count, countd), field to aggregate, and what to alias the column.", + "packageDefault": "[ ] (no aggregations)", + "$ref": "#/definitions/aggregation_vars" + }, + "snowplow__session_aggregations": { + "recommendFullRefresh": false, + "order": 120, + "consoleGroup": "advanced", + "title": "Session Aggregations", + "group": "Contexts, Filters, and Logs", + "longDescription": "Aggregations to calculate as part of the derived table. A list of dictionaries defining the type (sum, avg, min, max, count, countd), field to aggregate, and what to alias the column.", + "packageDefault": "[ ] (no aggregations)", + "$ref": "#/definitions/aggregation_vars" + }, + "snowplow__user_aggregations": { + "recommendFullRefresh": false, + "order": 128, + "consoleGroup": "advanced", + "title": "User First Aggregations", + "group": "Contexts, Filters, and Logs", + "longDescription": "Aggregations to calculate as part of the derived table. 
A list of dictionaries defining the type (sum, avg, min, max, count, countd), field to aggregate, and what to alias the column.", + "packageDefault": "[ ] (no aggregations)", + "$ref": "#/definitions/aggregation_vars" + }, + "snowplow__grant_select_to": { + "recommendFullRefresh": false, + "order": 106, + "consoleGroup": "advanced", + "type": "array", + "description": "> Click the plus sign to add a new entry", + "minItems": 0, + "items": { + "type": "string", + "title": "User/Role" + }, + "title": "Grant Select List", + "group": "Warehouse and Tracker", + "longDescription": "A list of users to grant select on all tables created by this package to.", + "packageDefault": "[]" + }, + "snowplow__grant_schema_usage": { + "recommendFullRefresh": false, + "order": 105, + "consoleGroup": "advanced", + "type": "boolean", + "description": "Enable granting usage on schemas", + "title": "Enable grant usage", + "group": "Warehouse and Tracker", + "longDescription": "Enables granting usage on schemas interacted with on a dbt run", + "packageDefault": "true" + }, + "snowplow__use_refr_if_mkt_null": { + "recommendFullRefresh": false, + "order": 133, + "consoleGroup": "advanced", + "type": "boolean", + "description": "Use refr fields when mkt fields are null for default channel group", + "title": "Use Refr if Mkt is null", + "group": "Operation and Logic", + "longDescription": "Use the refr fields when the mkt fields are null for the definition of default channel group. 
This is a common case when for example a landing page has a redirect.", + "packageDefault": "false" + }, + "snowplow__enable_initial_checks": { + "recommendFullRefresh": false, + "order": 134, + "consoleGroup": "advanced", + "type": "boolean", + "description": "Use this field to enable initial checks when changing the package configuration to ensure quick fails if the configuration is incorrect", + "title": "Enable Initial Checks", + "group": "Operation and Logic", + "longDescription": "Use this field to test the existance of all the seed models in the dwh and the existence of contexts that are needed based on your configuration. This is useful when changing the package configuration to ensure quick fails if the configuration is incorrect.", + "packageDefault": "false" + } + } +} From 5202e0aaf7de385be37e6a51eeb9a7c8cdd7816d Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Tue, 25 Jun 2024 12:52:00 +0100 Subject: [PATCH 2/3] Add new var and page on lake loaders --- .../dbt-operation/lakes/index.md | 30 +++++++++++++++++++ .../Schemas/dbtUnified_0.4.4.json | 12 ++++++++ 2 files changed, 42 insertions(+) create mode 100644 docs/modeling-your-data/modeling-your-data-with-dbt/dbt-operation/lakes/index.md diff --git a/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-operation/lakes/index.md b/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-operation/lakes/index.md new file mode 100644 index 0000000000..b703b31842 --- /dev/null +++ b/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-operation/lakes/index.md @@ -0,0 +1,30 @@ +--- +title: "Running the models on data lakehouses" +sidebar_position: 50 +description: "How to run our models on lakehouses" +--- + +:::danger + +Running the models on data lakes or lakehouses (using external tables in a warehouse to read directly from a lake) is currently in private preview state and is not fully supported. Certain features may not work as expected and errors are more likely to occur. 
Please use this approach at your own risk and raise any issues you find with us. + +::: + +If you are using the [lake loaders](/docs/storing-querying/storage-options/index.md#data-lake-loaders) to load your data into a lake storage option, it may be possible to use our data models. In general in this section of the docs we are not going to detail which warehouses support which file formats, or how to set up the respective tables in each warehouse - please see the docs for your appropriate warehouse to see what file formats they support. + +# Databricks +At time of writing, `delta` is the preferred file format for Databricks [external tables](https://docs.databricks.com/en/sql/language-manual/sql-ref-external-tables.html). If you create an external table from this lake format in Databricks, you should be able to run the models without any further changes required by simply pointing the model at this table. + +# Snowflake +At time of writing, `Iceberg` is the preferred file format for Snowflake [iceberg tables](https://docs.snowflake.com/en/user-guide/tables-iceberg). If you wish to use our models with this, currently only the [Unified Digital](/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-models/dbt-unified-data-model/index.md) package supports this, by setting the `snowplow__snowflake_lakeloader` variable to `true`. + +Note that compared to the other loaders for Snowflake, field names in Self-describing events and Entities are converted to `snake_case` format (the other loaders retain the format used in the schema, often `camelCase`). You will need to adjust other variables and inputs accordingly compared to what you may find in the docs. + +# Spark +Currently using Spark directly as a compute engine is not supported for our packages. + +# Redshift (spectrum) +Currently using Redshift Spectrum tables is not supported for our packages due to [limitations](https://docs.aws.amazon.com/redshift/latest/dg/nested-data-restrictions.html) with the platform. 
+ +# BigQuery on GCS +Currently using GCS/BigQuery external tables is not tested but may work, please let us know your experience if you try this. diff --git a/src/components/JsonSchemaValidator/Schemas/dbtUnified_0.4.4.json b/src/components/JsonSchemaValidator/Schemas/dbtUnified_0.4.4.json index f53ecd3a09..d38abe0112 100644 --- a/src/components/JsonSchemaValidator/Schemas/dbtUnified_0.4.4.json +++ b/src/components/JsonSchemaValidator/Schemas/dbtUnified_0.4.4.json @@ -935,6 +935,18 @@ "group": "Operation and Logic", "longDescription": "Use this field to test the existance of all the seed models in the dwh and the existence of contexts that are needed based on your configuration. This is useful when changing the package configuration to ensure quick fails if the configuration is incorrect.", "packageDefault": "false" + }, + "snowplow__snowflake_lakeloader": { + "recommendFullRefresh": true, + "order": 134, + "consoleGroup": "required", + "type": "boolean", + "description": "Enable running the models on an iceberg table using lakeloader data", + "title": "(Snowflake) Using lakeloader?", + "group": "Warehouse Specific", + "longDescription": "To use the models on an iceberg events table created via the lakeloader on Snowflake, this will need to be enabled due to different table structures.", + "packageDefault": "false", + "warehouse": "Snowflake" } } } From b00e0be6c5fe2e3efdb40927956390abb7fa776e Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Tue, 25 Jun 2024 12:56:47 +0100 Subject: [PATCH 3/3] Add early release badge --- .../dbt-operation/lakes/index.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-operation/lakes/index.md b/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-operation/lakes/index.md index b703b31842..f1c7c7428c 100644 --- a/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-operation/lakes/index.md +++ 
b/docs/modeling-your-data/modeling-your-data-with-dbt/dbt-operation/lakes/index.md @@ -4,9 +4,15 @@ sidebar_position: 50 description: "How to run our models on lakehouses" --- +```mdx-code-block +import Badges from '@site/src/components/Badges'; +``` + +  + :::danger -Running the models on data lakes or lakehouses (using external tables in a warehouse to read directly from a lake) is currently in private preview state and is not fully supported. Certain features may not work as expected and errors are more likely to occur. Please use this approach at your own risk and raise any issues you find with us. +Running the models on data lakes or lakehouses (using external tables in a warehouse to read directly from a lake) is currently in Early Release state and is not fully supported. Certain features may not work as expected and errors are more likely to occur. Please use this approach at your own risk and raise any issues you find with us. :::