End-to-end MLOps infrastructure for a financial services firm achieved 99.9% uptime and reduced deployment time from weeks to hours with automated monitoring and CI/CD
A leading financial services firm with $100B+ in assets under management was struggling with its machine learning deployment process. The data science team had developed numerous high-performing models for risk assessment, fraud detection, and algorithmic trading, but deploying these models to production was a manual, error-prone process that took weeks or even months.
The key challenges were a manual, error-prone deployment process, release cycles measured in weeks or months, and no automated monitoring once models reached production.
We designed and implemented a comprehensive MLOps pipeline that automated the entire machine learning lifecycle, from development through production monitoring. The GitLab CI/CD configuration below drives that pipeline:
# GitLab CI/CD pipeline for ML model deployment
stages:
  - data-validation
  - model-training
  - model-validation
  - staging-deployment
  - integration-tests
  - production-deployment
  - monitoring-setup

model-training:
  stage: model-training
  image: python:3.9
  script:
    - pip install -r requirements.txt
    - mlflow run . --experiment-name ${CI_COMMIT_REF_NAME}
    - mlflow models build-docker -m "models:/${MODEL_NAME}/${MODEL_VERSION}" -n ${MODEL_NAME}:${MODEL_VERSION}
  artifacts:
    paths:
      - model_artifacts/
    expire_in: 30 days
  only:
    - develop
    - master

staging-deployment:
  stage: staging-deployment
  image: bitnami/kubectl:latest
  script:
    - kubectl apply -f k8s/staging/
    - kubectl set image deployment/${MODEL_NAME}-staging ${MODEL_NAME}=${MODEL_NAME}:${MODEL_VERSION}
    - kubectl rollout status deployment/${MODEL_NAME}-staging
  environment:
    name: staging
    url: https://staging-api.company.com
  only:
    - develop

production-deployment:
  stage: production-deployment
  image: bitnami/kubectl:latest
  script:
    - kubectl apply -f k8s/production/
    # kubectl set image avoids the shell-quoting pitfall of an inline JSON
    # patch: variables inside a single-quoted patch string are never expanded
    - kubectl set image deployment/${MODEL_NAME} ${MODEL_NAME}=${MODEL_NAME}:${MODEL_VERSION}
    - kubectl rollout status deployment/${MODEL_NAME}
  environment:
    name: production
    url: https://api.company.com
  when: manual   # production releases require an explicit manual approval
  only:
    - master
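
The stages list above also declares data-validation, model-validation, integration-tests, and monitoring-setup jobs that are elided from the snippet. As one example, an integration-test job can run a smoke test against the staging release before the manual production gate. A minimal sketch, assuming a JSON scoring API at the staging URL (the /health and /predict paths and the payload fields are illustrative placeholders, not the firm's real schema):

# smoke_test.py - basic health and scoring checks against the staging release
# NOTE: endpoint paths and payload fields are illustrative placeholders
import sys
import requests

STAGING_URL = "https://staging-api.company.com"

def main() -> int:
    # 1. The service should report healthy
    health = requests.get(f"{STAGING_URL}/health", timeout=10)
    if health.status_code != 200:
        print(f"Health check failed: {health.status_code}")
        return 1

    # 2. A known-good payload should score without errors
    payload = {"feature1": 0.42, "feature2": 1.7, "feature3": -0.3,
               "category1": "retail", "category2": "US"}
    resp = requests.post(f"{STAGING_URL}/predict", json=payload, timeout=10)
    if resp.status_code != 200 or "prediction" not in resp.json():
        print(f"Scoring check failed: {resp.status_code} {resp.text}")
        return 1

    print("Smoke tests passed")
    return 0

if __name__ == "__main__":
    sys.exit(main())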
With deployment automated, the second pillar was production monitoring. The drift detector below is built on Evidently AI; it compares live traffic against the reference data the model was trained on and raises an alert when drift crosses a threshold.

# Automated model monitoring with Evidently AI
from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset
import logging


class ModelMonitor:
    def __init__(self, reference_data, model_name):
        self.reference_data = reference_data
        self.model_name = model_name
        self.logger = logging.getLogger(__name__)

    def detect_drift(self, current_data, threshold=0.1):
        """Detect data and target drift in production data."""
        # Configure column mapping (feature names here are placeholders for
        # the model's real schema)
        column_mapping = ColumnMapping(
            target='target',
            prediction='prediction',
            numerical_features=['feature1', 'feature2', 'feature3'],
            categorical_features=['category1', 'category2']
        )

        # Create and run the drift report against the reference window
        report = Report(metrics=[
            DataDriftPreset(),
            TargetDriftPreset()
        ])
        report.run(
            reference_data=self.reference_data,
            current_data=current_data,
            column_mapping=column_mapping
        )

        # Extract drift results by metric name rather than list position;
        # each preset expands into several metrics, so positional indexing
        # is fragile
        metrics = report.as_dict()['metrics']
        dataset_result = next(
            m['result'] for m in metrics if m['metric'] == 'DatasetDriftMetric')
        target_result = next(
            m['result'] for m in metrics if m['metric'] == 'ColumnDriftMetric')

        data_drift_score = dataset_result['share_of_drifted_columns']
        target_drift_score = target_result['drift_score']

        # drift_score semantics depend on the statistical test Evidently
        # selects, so rely on its own drift_detected flag for the target
        drift_detected = (data_drift_score > threshold
                          or target_result['drift_detected'])
        if drift_detected:
            self.trigger_retraining_alert(data_drift_score, target_drift_score)

        return {
            'data_drift': data_drift_score,
            'target_drift': target_drift_score,
            'drift_detected': drift_detected
        }

    def trigger_retraining_alert(self, data_drift, target_drift):
        """Trigger alert for model retraining."""
        alert_message = f"""
        Model drift detected for {self.model_name}:
        - Data drift score: {data_drift:.3f}
        - Target drift score: {target_drift:.3f}
        Recommend immediate model retraining and validation.
        """
        # Send to Slack/PagerDuty/etc.
        self.send_alert(alert_message)
        # Log the event
        self.logger.warning(
            f"Drift detected for {self.model_name}: "
            f"data={data_drift}, target={target_drift}")

    def send_alert(self, message):
        """Route the alert to the paging/chat integration.

        The production integration (Slack webhook, PagerDuty, etc.) is
        elided here; this fallback logs so the class runs end to end.
        """
        self.logger.error(message)

    def calculate_model_performance(self, y_true, y_pred, y_proba=None):
        """Calculate comprehensive model performance metrics."""
        from sklearn.metrics import (accuracy_score, precision_score,
                                     recall_score, f1_score, roc_auc_score)

        metrics = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, average='weighted'),
            'recall': recall_score(y_true, y_pred, average='weighted'),
            'f1_score': f1_score(y_true, y_pred, average='weighted')
        }
        if y_proba is not None:
            # Assumes binary classification: column 1 holds the
            # positive-class probability
            metrics['auc_roc'] = roc_auc_score(y_true, y_proba[:, 1])
        return metrics
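
In production this class would run on a schedule against fresh prediction logs. A minimal sketch of such a job, where the Parquet paths, model name, and hourly cadence are assumptions for illustration:

# monitor_job.py - hypothetical hourly batch job driving ModelMonitor
import logging
import pandas as pd

logging.basicConfig(level=logging.INFO)

# Reference window: the data the current model version was trained on
reference = pd.read_parquet("s3://ml-monitoring/fraud-model/reference.parquet")

# Current window: the last hour of scored production traffic
current = pd.read_parquet("s3://ml-monitoring/fraud-model/current_hour.parquet")

monitor = ModelMonitor(reference_data=reference, model_name="fraud-detection")
result = monitor.detect_drift(current, threshold=0.1)

if result['drift_detected']:
    # The alert has already fired inside detect_drift; this is also where
    # the Kubeflow retraining pipeline described below could be kicked off
    print(f"Drift detected: {result}")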
The final piece closed the loop: when the monitor flags drift, a Kubeflow pipeline can retrain the model automatically, gated on a data-quality check.

# Kubeflow pipeline for automated model retraining
from kfp import dsl
from kfp.components import create_component_from_func


@create_component_from_func
def data_validation_op(data_path: str, validation_rules_path: str) -> bool:
    """Validate data quality before retraining."""
    import json
    import great_expectations as ge
    import pandas as pd

    # Load the data and the expectation suite holding the validation rules
    df = pd.read_parquet(data_path)
    with open(validation_rules_path) as f:
        expectation_suite = json.load(f)

    # Run the suite against the candidate training data
    results = ge.from_pandas(df).validate(expectation_suite=expectation_suite)
    return results.success


@create_component_from_func
def model_training_op(data_path: str, model_config: dict) -> str:
    """Train a new model version and return its MLflow URI."""
    import mlflow
    import mlflow.sklearn
    from sklearn.ensemble import RandomForestClassifier
    import pandas as pd

    # Load training data
    df = pd.read_parquet(data_path)
    X = df.drop('target', axis=1)
    y = df['target']

    # Train and log the model under an MLflow run
    with mlflow.start_run():
        model = RandomForestClassifier(**model_config)
        model.fit(X, y)
        model_uri = mlflow.sklearn.log_model(model, "model").model_uri
    return model_uri


@dsl.pipeline(name='automated-retraining-pipeline')
def retraining_pipeline(
    data_path: str,
    model_config: dict,
    validation_rules_path: str
):
    """Complete automated retraining pipeline."""
    # Step 1: Validate data quality
    validation_task = data_validation_op(data_path, validation_rules_path)

    # Step 2: Train a new model only if validation succeeded
    # (KFP v1 serializes the bool output, so the comparison happens on the
    # string form at runtime)
    with dsl.Condition(validation_task.output == True):
        training_task = model_training_op(data_path, model_config)

    # Step 3: Model validation and A/B testing would follow...
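
To execute it, the pipeline function is compiled and submitted through the KFP SDK. A minimal sketch for a KFP v1 deployment, where the host, data paths, and hyperparameters are illustrative placeholders:

# submit_retraining.py - compile and launch the retraining pipeline (KFP v1)
import kfp
from kfp import compiler

# Compile the pipeline function to a workflow spec for inspection/versioning
compiler.Compiler().compile(retraining_pipeline, 'retraining_pipeline.yaml')

# Submit a one-off run; host and arguments are illustrative placeholders
client = kfp.Client(host='http://ml-pipeline.kubeflow:8888')
client.create_run_from_pipeline_func(
    retraining_pipeline,
    arguments={
        'data_path': 's3://ml-data/fraud/training_latest.parquet',
        'model_config': {'n_estimators': 200, 'max_depth': 8},
        'validation_rules_path': '/mnt/config/expectation_suite.json',
    },
)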