Lineage-graph-accelerator / samples /airflow_dag_sample.json
aamanlamba's picture
Phase 2: Enhanced lineage extraction with export to data catalogs
0510038
{
"dag_id": "ecommerce_etl_pipeline",
"description": "Daily ETL pipeline for e-commerce data warehouse",
"schedule_interval": "0 2 * * *",
"start_date": "2025-01-01",
"catchup": false,
"tags": ["etl", "ecommerce", "daily"],
"default_args": {
"owner": "data_engineering",
"retries": 3,
"retry_delay_minutes": 5,
"email_on_failure": true
},
"tasks": [
{
"task_id": "extract_customers",
"operator": "PythonOperator",
"description": "Extract customer data from source database",
"upstream_dependencies": [],
"downstream_dependencies": ["transform_customers"],
"source": "postgres://source_db/customers",
"target": "s3://data-lake/raw/customers/"
},
{
"task_id": "extract_orders",
"operator": "PythonOperator",
"description": "Extract orders data from source database",
"upstream_dependencies": [],
"downstream_dependencies": ["transform_orders"],
"source": "postgres://source_db/orders",
"target": "s3://data-lake/raw/orders/"
},
{
"task_id": "extract_products",
"operator": "PythonOperator",
"description": "Extract products data from source database",
"upstream_dependencies": [],
"downstream_dependencies": ["transform_products"],
"source": "postgres://source_db/products",
"target": "s3://data-lake/raw/products/"
},
{
"task_id": "extract_order_items",
"operator": "PythonOperator",
"description": "Extract order items from source database",
"upstream_dependencies": [],
"downstream_dependencies": ["transform_order_items"],
"source": "postgres://source_db/order_items",
"target": "s3://data-lake/raw/order_items/"
},
{
"task_id": "transform_customers",
"operator": "SparkSubmitOperator",
"description": "Clean and transform customer data",
"upstream_dependencies": ["extract_customers"],
"downstream_dependencies": ["load_dim_customers"],
"source": "s3://data-lake/raw/customers/",
"target": "s3://data-lake/transformed/customers/"
},
{
"task_id": "transform_orders",
"operator": "SparkSubmitOperator",
"description": "Clean and transform orders data",
"upstream_dependencies": ["extract_orders"],
"downstream_dependencies": ["load_fct_orders"],
"source": "s3://data-lake/raw/orders/",
"target": "s3://data-lake/transformed/orders/"
},
{
"task_id": "transform_products",
"operator": "SparkSubmitOperator",
"description": "Clean and transform products data",
"upstream_dependencies": ["extract_products"],
"downstream_dependencies": ["load_dim_products"],
"source": "s3://data-lake/raw/products/",
"target": "s3://data-lake/transformed/products/"
},
{
"task_id": "transform_order_items",
"operator": "SparkSubmitOperator",
"description": "Clean and transform order items data",
"upstream_dependencies": ["extract_order_items"],
"downstream_dependencies": ["load_fct_orders"],
"source": "s3://data-lake/raw/order_items/",
"target": "s3://data-lake/transformed/order_items/"
},
{
"task_id": "load_dim_customers",
"operator": "SnowflakeOperator",
"description": "Load customer dimension to Snowflake",
"upstream_dependencies": ["transform_customers"],
"downstream_dependencies": ["build_customer_metrics"],
"source": "s3://data-lake/transformed/customers/",
"target": "snowflake://warehouse/analytics.dim_customers"
},
{
"task_id": "load_dim_products",
"operator": "SnowflakeOperator",
"description": "Load product dimension to Snowflake",
"upstream_dependencies": ["transform_products"],
"downstream_dependencies": ["build_sales_report"],
"source": "s3://data-lake/transformed/products/",
"target": "snowflake://warehouse/analytics.dim_products"
},
{
"task_id": "load_fct_orders",
"operator": "SnowflakeOperator",
"description": "Load orders fact table to Snowflake",
"upstream_dependencies": ["transform_orders", "transform_order_items"],
"downstream_dependencies": ["build_customer_metrics", "build_sales_report"],
"source": ["s3://data-lake/transformed/orders/", "s3://data-lake/transformed/order_items/"],
"target": "snowflake://warehouse/analytics.fct_orders"
},
{
"task_id": "build_customer_metrics",
"operator": "SnowflakeOperator",
"description": "Calculate customer lifetime value and metrics",
"upstream_dependencies": ["load_dim_customers", "load_fct_orders"],
"downstream_dependencies": ["publish_to_bi"],
"source": ["snowflake://warehouse/analytics.dim_customers", "snowflake://warehouse/analytics.fct_orders"],
"target": "snowflake://warehouse/analytics.rpt_customer_metrics"
},
{
"task_id": "build_sales_report",
"operator": "SnowflakeOperator",
"description": "Build daily sales report",
"upstream_dependencies": ["load_dim_products", "load_fct_orders"],
"downstream_dependencies": ["publish_to_bi"],
"source": ["snowflake://warehouse/analytics.dim_products", "snowflake://warehouse/analytics.fct_orders"],
"target": "snowflake://warehouse/analytics.rpt_daily_sales"
},
{
"task_id": "publish_to_bi",
"operator": "PythonOperator",
"description": "Publish reports to BI tool",
"upstream_dependencies": ["build_customer_metrics", "build_sales_report"],
"downstream_dependencies": ["notify_stakeholders"],
"source": ["snowflake://warehouse/analytics.rpt_customer_metrics", "snowflake://warehouse/analytics.rpt_daily_sales"],
"target": "tableau://server/ecommerce_dashboard"
},
{
"task_id": "notify_stakeholders",
"operator": "EmailOperator",
"description": "Send completion notification",
"upstream_dependencies": ["publish_to_bi"],
"downstream_dependencies": []
}
],
"notes": "Sample Airflow DAG representing a complete ETL pipeline with extract, transform, load, and reporting stages."
}