🗄️ Schéma Data Warehouse étoile — SQL
Format : SQL · PostgreSQL + Snowflake + BigQuery Auteur : Équipe pédagogique ITAG · Data Pack Mise à jour : 2026
🎯 Description
Schéma star (modélisation Kimball) prêt à déployer pour data warehouse. Tables de faits + dimensions, partitioning, indexes, vues matérialisées. Compatible PostgreSQL, Snowflake, BigQuery, Redshift.
📋 Contenu (4 fichiers SQL)
1. 01_dimensions.sql
dim_date(avec hiérarchie année / trimestre / mois / semaine / jour)dim_customer(SCD Type 2 avec valid_from / valid_to)dim_product(avec catégories hiérarchiques)dim_geography(pays / région / ville)dim_channel(online / offline / partenaire)
2. 02_facts.sql
fact_sales(partitionnée par date, FK vers toutes les dimensions)fact_inventory(snapshot quotidien)fact_web_events(granulaire, pour funnel analysis)
3. 03_indexes.sql
- Indexes B-tree sur clés de jointure
- Indexes BRIN pour très gros volumes (PostgreSQL)
- Clustering keys (Snowflake)
4. 04_views.sql
vw_sales_daily— agrégation pré-calculéevw_customer_360— vue dénormaliséevw_cohort_retention— analyse rétention
💾 Schéma SQL complet
Dimension — dim_customer (SCD Type 2)
CREATE TABLE dim_customer (
customer_key SERIAL PRIMARY KEY,
customer_id VARCHAR(50) NOT NULL, -- Clé naturelle (source)
name VARCHAR(200) NOT NULL,
email VARCHAR(200),
country VARCHAR(100),
city VARCHAR(100),
segment VARCHAR(50), -- B2B / B2C / SME / Enterprise
created_at DATE,
valid_from DATE NOT NULL DEFAULT CURRENT_DATE,
valid_to DATE DEFAULT '9999-12-31',
is_current BOOLEAN DEFAULT TRUE,
_etl_loaded TIMESTAMP DEFAULT NOW()
);
INSERT INTO dim_customer (customer_id, name, email, country, city, segment, created_at) VALUES
('C001', 'Acme Corp', 'contact@acme.com', 'France', 'Paris', 'Enterprise', '2022-01-15'),
('C002', 'Dupont SARL', 'info@dupont.fr', 'France', 'Lyon', 'SME', '2022-03-20'),
('C003', 'TechStart Ltd', 'hello@techstart.io', 'Canada', 'Montréal', 'B2B', '2022-06-01'),
('C004', 'Marie Curie', 'marie@example.com', 'Belgique', 'Bruxelles', 'B2C', '2023-01-10'),
('C005', 'Global Trade SA', 'trading@global.com', 'Sénégal', 'Dakar', 'Enterprise', '2023-02-14'),
('C006', 'InnovateCo', 'ceo@innovate.co', 'Maroc', 'Casablanca', 'SME', '2023-05-22'),
('C007', 'Jean Lefebvre', 'jean.l@mail.fr', 'France', 'Toulouse', 'B2C', '2023-08-30'),
('C008', 'DataPlex Inc', 'ops@dataplex.ca', 'Canada', 'Toronto', 'Enterprise', '2024-01-05'),
('C009', 'Soleil RCI', 'admin@soleil-ci.com', 'Côte d''Ivoire', 'Abidjan', 'B2B', '2024-03-11'),
('C010', 'Atlas Logistic', 'logistics@atlas.ma', 'Maroc', 'Rabat', 'SME', '2024-07-19');
Dimension — dim_product
CREATE TABLE dim_product (
product_key SERIAL PRIMARY KEY,
product_id VARCHAR(50) NOT NULL,
name VARCHAR(200) NOT NULL,
category VARCHAR(100),
subcategory VARCHAR(100),
brand VARCHAR(100),
price NUMERIC(12,2),
cost NUMERIC(12,2),
margin_pct NUMERIC(5,2) GENERATED ALWAYS AS
(ROUND((price - cost) / NULLIF(price, 0) * 100, 2)) STORED,
is_active BOOLEAN DEFAULT TRUE,
_etl_loaded TIMESTAMP DEFAULT NOW()
);
INSERT INTO dim_product (product_id, name, category, subcategory, brand, price, cost) VALUES
('P001', 'Formation SQL Expert', 'Formation', 'Data', 'ITAG', 299.00, 45.00),
('P002', 'Certif. Cloud AWS', 'Certification','Infrastructure','ITAG', 499.00, 80.00),
('P003', 'Pack Data Engineer', 'Pack', 'Data', 'ITAG', 799.00, 120.00),
('P004', 'Abonnement Premium 1 an', 'Abonnement', 'Accès', 'ITAG', 199.00, 20.00),
('P005', 'Formation Python', 'Formation', 'Dev', 'ITAG', 249.00, 38.00),
('P006', 'Certif. PMP', 'Certification','Management', 'ITAG', 399.00, 65.00),
('P007', 'Formation Excel Avancé', 'Formation', 'Business', 'ITAG', 149.00, 22.00),
('P008', 'Pack Langue DELF B2', 'Pack', 'Langue', 'ITAG', 349.00, 55.00),
('P009', 'Abonnement Équipe 5 users','Abonnement', 'Accès', 'ITAG', 799.00, 60.00),
('P010', 'Formation NCLEX-RN', 'Formation', 'Santé', 'ITAG', 549.00, 90.00);
Dimension — dim_date
CREATE TABLE dim_date (
date_key INT PRIMARY KEY, -- Format YYYYMMDD : 20240101
full_date DATE NOT NULL,
year SMALLINT,
quarter SMALLINT,
month SMALLINT,
month_name VARCHAR(20),
week SMALLINT,
day_of_week SMALLINT, -- 0=Dimanche, 6=Samedi
day_name VARCHAR(20),
is_weekend BOOLEAN DEFAULT FALSE,
is_holiday BOOLEAN DEFAULT FALSE
);
INSERT INTO dim_date (date_key, full_date, year, quarter, month, month_name, week, day_of_week, day_name, is_weekend) VALUES
(20240101, '2024-01-01', 2024, 1, 1, 'Janvier', 1, 1, 'Lundi', FALSE),
(20240201, '2024-02-01', 2024, 1, 2, 'Février', 5, 4, 'Jeudi', FALSE),
(20240301, '2024-03-01', 2024, 1, 3, 'Mars', 9, 5, 'Vendredi',FALSE),
(20240401, '2024-04-01', 2024, 2, 4, 'Avril', 14, 1, 'Lundi', FALSE),
(20240501, '2024-05-01', 2024, 2, 5, 'Mai', 18, 3, 'Mercredi',FALSE),
(20240601, '2024-06-01', 2024, 2, 6, 'Juin', 22, 6, 'Samedi', TRUE),
(20240701, '2024-07-01', 2024, 3, 7, 'Juillet', 27, 1, 'Lundi', FALSE),
(20240801, '2024-08-01', 2024, 3, 8, 'Août', 31, 4, 'Jeudi', FALSE),
(20240901, '2024-09-01', 2024, 3, 9, 'Septembre',35, 0, 'Dimanche',TRUE),
(20241001, '2024-10-01', 2024, 4, 10, 'Octobre', 40, 2, 'Mardi', FALSE);
Dimension — dim_geography
CREATE TABLE dim_geography (
geo_key SERIAL PRIMARY KEY,
country_code CHAR(2) NOT NULL, -- ISO 3166-1 alpha-2
country VARCHAR(100) NOT NULL,
region VARCHAR(100),
city VARCHAR(100),
timezone VARCHAR(50)
);
INSERT INTO dim_geography (country_code, country, region, city, timezone) VALUES
('FR', 'France', 'Europe de l''Ouest', 'Paris', 'Europe/Paris'),
('FR', 'France', 'Europe de l''Ouest', 'Lyon', 'Europe/Paris'),
('FR', 'France', 'Europe de l''Ouest', 'Toulouse', 'Europe/Paris'),
('CA', 'Canada', 'Amérique du Nord', 'Montréal', 'America/Montreal'),
('CA', 'Canada', 'Amérique du Nord', 'Toronto', 'America/Toronto'),
('BE', 'Belgique', 'Europe de l''Ouest', 'Bruxelles', 'Europe/Brussels'),
('SN', 'Sénégal', 'Afrique de l''Ouest','Dakar', 'Africa/Dakar'),
('MA', 'Maroc', 'Afrique du Nord', 'Casablanca', 'Africa/Casablanca'),
('MA', 'Maroc', 'Afrique du Nord', 'Rabat', 'Africa/Casablanca'),
('CI', 'Côte d''Ivoire','Afrique de l''Ouest','Abidjan', 'Africa/Abidjan');
Table de faits — fact_sales
CREATE TABLE fact_sales (
sale_id BIGSERIAL PRIMARY KEY,
date_key INT NOT NULL REFERENCES dim_date(date_key),
customer_key INT NOT NULL REFERENCES dim_customer(customer_key),
product_key INT NOT NULL REFERENCES dim_product(product_key),
geo_key INT NOT NULL REFERENCES dim_geography(geo_key),
quantity INT NOT NULL DEFAULT 1,
unit_price NUMERIC(12,2) NOT NULL,
revenue NUMERIC(12,2) NOT NULL, -- quantity × unit_price
cost NUMERIC(12,2),
margin NUMERIC(12,2), -- revenue - cost
discount_pct NUMERIC(5,2) DEFAULT 0,
channel VARCHAR(50), -- 'web', 'mobile', 'partner'
_etl_loaded TIMESTAMP DEFAULT NOW()
) PARTITION BY RANGE (date_key);
-- Partitions par année
CREATE TABLE fact_sales_2023 PARTITION OF fact_sales
FOR VALUES FROM (20230101) TO (20231231);
CREATE TABLE fact_sales_2024 PARTITION OF fact_sales
FOR VALUES FROM (20240101) TO (20241231);
INSERT INTO fact_sales (date_key, customer_key, product_key, geo_key, quantity, unit_price, revenue, cost, margin, discount_pct, channel) VALUES
(20240101, 1, 1, 1, 1, 299.00, 299.00, 45.00, 254.00, 0, 'web'),
(20240201, 2, 4, 2, 2, 199.00, 398.00, 40.00, 358.00, 0, 'web'),
(20240301, 3, 2, 4, 1, 499.00, 499.00, 80.00, 419.00, 0, 'mobile'),
(20240401, 4, 5, 6, 1, 249.00, 249.00, 38.00, 211.00, 10.0, 'web'),
(20240501, 5, 3, 7, 1, 799.00, 799.00, 120.00, 679.00, 0, 'partner'),
(20240601, 6, 6, 8, 1, 399.00, 399.00, 65.00, 334.00, 5.0, 'web'),
(20240701, 7, 7, 3, 3, 149.00, 447.00, 66.00, 381.00, 0, 'mobile'),
(20240801, 8, 9, 5, 1, 799.00, 799.00, 60.00, 739.00, 0, 'partner'),
(20240901, 9, 10, 10, 1, 549.00, 549.00, 90.00, 459.00, 15.0, 'web'),
(20241001, 10, 8, 9, 1, 349.00, 349.00, 55.00, 294.00, 0, 'web');
Table de faits — fact_events (Web analytics)
CREATE TABLE fact_events (
event_id BIGSERIAL PRIMARY KEY,
date_key INT NOT NULL REFERENCES dim_date(date_key),
customer_key INT REFERENCES dim_customer(customer_key),
event_type VARCHAR(50) NOT NULL, -- 'page_view', 'click', 'purchase', 'signup'
page VARCHAR(200),
session_id VARCHAR(100),
device VARCHAR(30), -- 'desktop', 'mobile', 'tablet'
duration_sec INT,
_etl_loaded TIMESTAMP DEFAULT NOW()
);
INSERT INTO fact_events (date_key, customer_key, event_type, page, session_id, device, duration_sec) VALUES
(20240101, 1, 'page_view', '/cours/sql-avance', 'sess_a1b2', 'desktop', 245),
(20240101, 1, 'click', '/cours/sql-avance#acheter','sess_a1b2', 'desktop', 5),
(20240101, 1, 'purchase', '/checkout', 'sess_a1b2', 'desktop', 120),
(20240201, 2, 'page_view', '/simulateurs/delf-b2', 'sess_c3d4', 'mobile', 180),
(20240201, 2, 'signup', '/inscription', 'sess_c3d4', 'mobile', 90),
(20240301, NULL, 'page_view', '/blog/sql-window', 'sess_e5f6', 'desktop', 420),
(20240301, 3, 'page_view', '/certifications/aws', 'sess_g7h8', 'tablet', 310),
(20240401, 4, 'page_view', '/cours/python', 'sess_i9j0', 'mobile', 195),
(20240501, 5, 'page_view', '/parcours/data', 'sess_k1l2', 'desktop', 560),
(20240601, NULL, 'page_view', '/accueil', 'sess_m3n4', 'mobile', 45);
Index — Optimisation des jointures
-- Indexes sur les clés étrangères de fact_sales
CREATE INDEX idx_fact_sales_date_key ON fact_sales (date_key);
CREATE INDEX idx_fact_sales_customer_key ON fact_sales (customer_key);
CREATE INDEX idx_fact_sales_product_key ON fact_sales (product_key);
CREATE INDEX idx_fact_sales_geo_key ON fact_sales (geo_key);
CREATE INDEX idx_fact_sales_channel ON fact_sales (channel);
-- Index composite pour les requêtes analytiques courantes
CREATE INDEX idx_fact_sales_date_product ON fact_sales (date_key, product_key);
CREATE INDEX idx_fact_sales_date_customer ON fact_sales (date_key, customer_key);
-- Index sur fact_events
CREATE INDEX idx_fact_events_date_key ON fact_events (date_key);
CREATE INDEX idx_fact_events_customer_key ON fact_events (customer_key);
CREATE INDEX idx_fact_events_session_id ON fact_events (session_id);
CREATE INDEX idx_fact_events_event_type ON fact_events (event_type);
-- Index BRIN pour très gros volumes (colonnes monotones comme date_key)
-- (PostgreSQL uniquement — plus léger qu'un B-tree sur partitions larges)
CREATE INDEX idx_fact_sales_brin ON fact_sales USING BRIN (date_key);
Vues — vw_customer_360
CREATE OR REPLACE VIEW vw_customer_360 AS
SELECT
c.customer_id,
c.name AS customer_name,
c.segment,
c.country,
COUNT(DISTINCT fs.sale_id) AS total_orders,
SUM(fs.revenue) AS lifetime_revenue,
AVG(fs.revenue) AS avg_order_value,
MAX(dd.full_date) AS last_purchase_date,
MIN(dd.full_date) AS first_purchase_date,
COUNT(DISTINCT fs.product_key) AS distinct_products_bought,
-- Segmentation RFM simplifiée
CASE
WHEN SUM(fs.revenue) > 1000 AND COUNT(*) > 3 THEN 'Champion'
WHEN SUM(fs.revenue) > 500 THEN 'Loyal'
WHEN MAX(dd.full_date) > CURRENT_DATE - 90 THEN 'Récent'
ELSE 'À risque'
END AS rfm_segment
FROM dim_customer c
LEFT JOIN fact_sales fs ON c.customer_key = fs.customer_key
LEFT JOIN dim_date dd ON fs.date_key = dd.date_key
WHERE c.is_current = TRUE
GROUP BY c.customer_id, c.name, c.segment, c.country;
Vue — vw_cohort_retention
CREATE OR REPLACE VIEW vw_cohort_retention AS
WITH first_purchase AS (
SELECT
customer_key,
MIN(date_key) AS cohort_date_key,
DATE_TRUNC('month',
(SELECT full_date FROM dim_date WHERE date_key = MIN(fs.date_key))
) AS cohort_month
FROM fact_sales fs
GROUP BY customer_key
),
purchases_by_month AS (
SELECT
fs.customer_key,
DATE_TRUNC('month', dd.full_date) AS purchase_month
FROM fact_sales fs
JOIN dim_date dd ON fs.date_key = dd.date_key
GROUP BY fs.customer_key, DATE_TRUNC('month', dd.full_date)
),
cohort_data AS (
SELECT
fp.cohort_month,
pbm.purchase_month,
EXTRACT(MONTH FROM AGE(pbm.purchase_month, fp.cohort_month))::INT AS months_since_first,
COUNT(DISTINCT fp.customer_key) AS cohort_size,
COUNT(DISTINCT pbm.customer_key) AS active_customers
FROM first_purchase fp
JOIN purchases_by_month pbm ON fp.customer_key = pbm.customer_key
GROUP BY fp.cohort_month, pbm.purchase_month,
EXTRACT(MONTH FROM AGE(pbm.purchase_month, fp.cohort_month))
)
SELECT
cohort_month,
months_since_first,
cohort_size,
active_customers,
ROUND(active_customers::NUMERIC / NULLIF(cohort_size, 0) * 100, 1) AS retention_pct
FROM cohort_data
ORDER BY cohort_month, months_since_first;
Vue — vw_monthly_revenue
CREATE OR REPLACE VIEW vw_monthly_revenue AS
SELECT
dd.year,
dd.month,
dd.month_name,
p.category AS product_category,
g.country,
COUNT(DISTINCT fs.sale_id) AS orders,
COUNT(DISTINCT fs.customer_key) AS unique_customers,
SUM(fs.revenue) AS revenue,
SUM(fs.margin) AS gross_margin,
ROUND(SUM(fs.margin) / NULLIF(SUM(fs.revenue), 0) * 100, 1) AS margin_pct,
-- YTD cumulé via window function
SUM(SUM(fs.revenue)) OVER (
PARTITION BY dd.year, p.category
ORDER BY dd.month
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS revenue_ytd
FROM fact_sales fs
JOIN dim_date dd ON fs.date_key = dd.date_key
JOIN dim_product p ON fs.product_key = p.product_key
JOIN dim_geography g ON fs.geo_key = g.geo_key
GROUP BY dd.year, dd.month, dd.month_name, p.category, g.country
ORDER BY dd.year, dd.month;
💼 Cas d'usage
- Démarrer une plateforme analytics from scratch
- Migration legacy DWH vers Snowflake/BigQuery
- Pratique pour interview Data Engineer
- Architecture lakehouse / dbt project
📥 Télécharger ce template
Télécharger le fichier .md · ← Retour catalogue · Parcours Data →
ITAG · Non-affiliés Snowflake / Google / AWS / dbt Labs.