# Vous allez construire de A à Z un modèle prédisant le loyer mensuel d'un appartement à Yaoundé/Douala, exposé via une API et un dashboard. Portfolio garanti pour décrocher un job data.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# --- Load & explore ---------------------------------------------------------
df = pd.read_csv('camrent.csv')
print(df.shape)  # expected (5000, 12)
print(df.describe())

# Monthly rent is right-skewed, so we model log(1 + loyer) instead.
# (np requires the numpy import at the top of the file.)
plt.figure()
sns.histplot(df['loyer'], bins=50, kde=True)
df['loyer_log'] = np.log1p(df['loyer'])

# Pairwise correlations between numeric columns only.
plt.figure()
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm')

# Median rent of the 10 most expensive districts.
top = df.groupby('quartier')['loyer'].median().nlargest(10)
plt.figure()
sns.barplot(x=top.values, y=top.index)

# Business-driven ratios; clip(lower=1) guards against division by zero
# on degenerate rows (0 bedrooms / 0 m²), mirroring the 'chambres' guard.
df['surface_par_chambre'] = df['surface'] / df['chambres'].clip(lower=1)
df['densite_pieces'] = (df['chambres'] + df['sdb']) / df['surface'].clip(lower=1)
# Distance centre-ville (lat/lon → km)
from math import radians, cos, sin, asin, sqrt
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two (lat, lon) points.

    Uses the haversine formula on a spherical Earth of radius 6371 km;
    inputs are in decimal degrees.
    """
    rlat1, rlon1, rlat2, rlon2 = (radians(v) for v in (lat1, lon1, lat2, lon2))
    half_chord = (
        sin((rlat2 - rlat1) / 2) ** 2
        + cos(rlat1) * cos(rlat2) * sin((rlon2 - rlon1) / 2) ** 2
    )
    return 2 * 6371 * asin(sqrt(half_chord))
# Distance from each listing to the reference city centre, in km.
# NOTE(review): only Yaoundé's centre is used here even though the data may
# cover Douala as well — confirm whether a per-city centre is needed.
centre_yde = (3.8480, 11.5021)  # (lat, lon)
# NOTE(review): row-wise .apply makes one Python call per row; a vectorized
# NumPy haversine would be much faster if the dataset grows.
df['dist_centre'] = df.apply(
lambda r: haversine(r['lat'], r['lon'], *centre_yde), axis=1
)
# District encoding: target (median-rent) encoding instead of one-hot, to
# avoid a dimensionality explosion with many districts.
# NOTE(review): this is computed on the FULL dataset before the train/test
# split, so test-set targets leak into the feature — fit on the train fold
# only (or inside a CV pipeline) for honest evaluation.
quartier_loyer = df.groupby('quartier')['loyer'].median()
df['quartier_score'] = df['quartier'].map(quartier_loyer)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBRegressor
import numpy as np
# Feature matrix: drop the targets AND the raw district string — 'quartier'
# is non-numeric and would make LinearRegression fail; its signal is already
# carried by the numeric 'quartier_score' column.
X = df.drop(columns=['loyer', 'loyer_log', 'quartier'])
y = df['loyer_log']  # train on log-rent to tame the right skew
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline: linear regression; RMSE reported back in FCFA via expm1.
lr = LinearRegression().fit(X_train, y_train)
pred_lr = np.expm1(lr.predict(X_test))
print('LR RMSE :', np.sqrt(((pred_lr - np.expm1(y_test))**2).mean()))
# XGBoost: boosted trees with subsampling for regularization.
xgb = XGBRegressor(
    n_estimators=500, learning_rate=0.05,
    max_depth=6, subsample=0.8, colsample_bytree=0.8,
    random_state=42,
)
# eval_set lets XGBoost track hold-out error during boosting (silent here).
xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
pred_xgb = np.expm1(xgb.predict(X_test))
print('XGB RMSE :', np.sqrt(((pred_xgb - np.expm1(y_test))**2).mean()))

# Persist the model — api.py and app.py both load 'xgb_loyer.joblib', but
# the original script never wrote it.
# NOTE(review): the API sends only 5 features while X holds more columns
# (lat, lon, ratios...) — align the training features with the serving
# schema before deploying.
import joblib
joblib.dump(xgb, 'xgb_loyer.joblib')
# 5-fold CV on the full dataset. NOTE(review): quartier_score was fitted on
# all rows, so these CV scores are optimistically biased by target leakage.
scores = cross_val_score(xgb, X, y, cv=5, scoring='neg_root_mean_squared_error')
# y is log-rent, so this RMSE is in log units (~0.x); the original ':.0f'
# format rounded it to '0 ± 0'. Report with decimals and an explicit label.
print(f'CV RMSE (log scale) : {-scores.mean():.3f} ± {scores.std():.3f}')

# Residuals vs predictions: any visible trend indicates systematic bias.
plt.figure()
residus = np.expm1(y_test) - pred_xgb
sns.scatterplot(x=pred_xgb, y=residus)
plt.axhline(0, color='red')
# api.py — FastAPI service exposing the trained rent model over HTTP.
from fastapi import FastAPI
from pydantic import BaseModel
import joblib, numpy as np
app = FastAPI(title='CamRent Predictor')
# Loaded once at startup; the training script must have written this file
# beforehand, otherwise the service fails to boot.
model = joblib.load('xgb_loyer.joblib')
class Bien(BaseModel):
    """Request body for /predict: the numeric features of one property."""

    surface: float         # living area (m², per the dashboard slider)
    chambres: int          # number of bedrooms
    sdb: int               # number of bathrooms
    quartier_score: float  # target-encoded district (median rent)
    dist_centre: float     # distance to the city centre, km
@app.post('/predict')
def predict(b: Bien):
    """Estimate the monthly rent (FCFA) for the given property.

    NOTE(review): this 5-feature row must match the columns the model was
    trained on — confirm against the training script.
    """
    features = np.array([[b.surface, b.chambres, b.sdb, b.quartier_score, b.dist_centre]])
    # Back-transform from log space, then round to the nearest 1000 FCFA.
    montant = float(np.expm1(model.predict(features)[0]))
    return {'loyer_estime_fcfa': round(montant, -3)}
# Run with: uvicorn api:app --reload
# app.py — Streamlit dashboard for interactive rent estimation.
import streamlit as st
import joblib, numpy as np

st.title('🏙️ CamRent — Estimer un loyer')
model = joblib.load('xgb_loyer.joblib')

# Hard-coded target-encoding scores (median rent per district).
# NOTE(review): these should stay in sync with the values learned at
# training time — confirm against the training data.
scores = {'Bastos': 450000, 'Bonapriso': 380000, 'Mendong': 150000, 'Akwa': 220000}

# Input widgets (label, min, max, default).
surface = st.slider('Surface (m²)', 20, 300, 80)
chambres = st.slider('Chambres', 1, 6, 3)
sdb = st.slider('Salles de bain', 1, 4, 2)
quartier = st.selectbox('Quartier', ['Bastos', 'Bonapriso', 'Mendong', 'Akwa'])
dist = st.number_input('Distance centre (km)', 0.0, 30.0, 5.0)

if st.button('Estimer'):
    row = np.array([[surface, chambres, sdb, scores[quartier], dist]])
    # Back-transform the log-rent prediction to FCFA.
    montant = float(np.expm1(model.predict(row)[0]))
    st.success(f'Loyer estimé : {montant:,.0f} FCFA / mois')
# Run with: streamlit run app.py