Domain-specific fine-tuning for healthcare AI improved diagnostic accuracy by 25% while reducing inference costs by 40% through optimized model architecture
A leading healthcare system serving 2.5 million patients annually needed an AI solution to assist physicians in diagnosing rare diseases from medical imaging and clinical notes. Existing general-purpose medical AI models fell short on accuracy for rare conditions, and commercial solutions were prohibitively expensive to deploy at that scale.
The key challenges included:

- Insufficient accuracy on rare conditions, where labeled training data is scarce
- Prohibitive per-physician licensing costs for commercial solutions at the scale of 2.5 million patients per year
- The need to reason over both medical imaging and unstructured clinical notes
We developed a custom fine-tuning approach that combined multiple pre-trained models with domain-specific data to create a highly accurate, cost-effective diagnostic assistant. The approach had four main steps:

- Started with BioBERT for clinical text analysis and ResNet-50 pre-trained on medical images as foundation models (a loading sketch follows this list).
- Fine-tuned on 500K+ general medical records to adapt to the healthcare system's specific terminology and practices.
- Applied focal loss and class weighting to improve performance on rare conditions with limited labeled data.
- Used knowledge distillation and pruning to reduce model size by 60% while maintaining accuracy for faster inference.
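A minimal sketch of the first step, assuming the public `dmis-lab/biobert-v1.1` checkpoint and torchvision's ImageNet-pretrained ResNet-50 as stand-ins for the exact weights used (the case study's imaging backbone was already pre-trained on medical images, which is not a public checkpoint):

```python
# Sketch: loading the two foundation models (checkpoint names are illustrative)
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from torchvision import models

# Clinical-text encoder: a public BioBERT checkpoint stands in for the one used here
text_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
text_encoder = AutoModel.from_pretrained("dmis-lab/biobert-v1.1")

# Imaging encoder: ImageNet weights as a placeholder for the medical-image pre-training
image_encoder = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
image_encoder.fc = nn.Identity()  # drop the ImageNet head to use it as a feature extractor
```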
```python
# Custom fine-tuning for rare disease classification
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader, WeightedRandomSampler
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class MedicalDiagnosisModel(nn.Module):
    """BioBERT-based classifier for rare disease diagnosis from clinical text."""

    def __init__(self, model_name, num_classes, dropout_rate=0.3):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.focal_loss = FocalLoss(alpha=1, gamma=2)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        output = self.dropout(pooled_output)
        logits = self.classifier(output)
        if labels is not None:
            loss = self.focal_loss(logits, labels)
            return {'loss': loss, 'logits': logits}
        return {'logits': logits}


class FocalLoss(nn.Module):
    """Down-weights easy examples so training focuses on hard, rare-class cases."""

    def __init__(self, alpha=1, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.ce_loss = nn.CrossEntropyLoss(reduction='none')

    def forward(self, inputs, targets):
        ce_loss = self.ce_loss(inputs, targets)
        pt = torch.exp(-ce_loss)  # probability assigned to the true class
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean()


def create_class_weights(dataset):
    """Create balanced weights for rare disease classes."""
    class_counts = np.bincount(dataset.labels)
    total_samples = len(dataset.labels)
    class_weights = total_samples / (len(class_counts) * class_counts)
    return torch.FloatTensor(class_weights)
```
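The class weights above can also drive the sampler, so that rare-disease cases are seen more often per epoch. The sketch below shows one way to feed `create_class_weights` into the `WeightedRandomSampler` imported earlier; `train_dataset` is a hypothetical PyTorch Dataset exposing the `labels` array that `create_class_weights` expects:

```python
# Sketch: oversampling rare classes with WeightedRandomSampler
# (train_dataset is hypothetical and assumed to expose a `labels` array)
class_weights = create_class_weights(train_dataset)
sample_weights = class_weights[train_dataset.labels]  # one weight per training example

sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True
)
train_loader = DataLoader(train_dataset, batch_size=16, sampler=sampler)
```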
```python
def evaluate_model(model, data_loader):
    """Minimal validation-accuracy helper (assumed; the original metrics code is not shown)."""
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in data_loader:
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device)
            )
            preds = outputs['logits'].argmax(dim=-1)
            correct += (preds == batch['labels'].to(device)).sum().item()
            total += batch['labels'].size(0)
    return correct / total


def fine_tune_model(model, train_loader, val_loader, num_epochs=10, lr=2e-5):
    """Fine-tuning loop with gradient clipping, cosine LR decay, and early stopping."""
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)

    best_val_accuracy = 0
    patience = 3
    patience_counter = 0

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['labels'].to(device)
            )
            loss = outputs['loss']
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_loss += loss.item()

        # Validation phase
        model.eval()
        val_accuracy = evaluate_model(model, val_loader)

        # Early stopping on validation accuracy
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.pt')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch}")
                break

        scheduler.step()
        print(f"Epoch {epoch}: Loss={total_loss/len(train_loader):.4f}, Val Acc={val_accuracy:.4f}")

    return model
```
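Putting the pieces together, a typical invocation could look like the sketch below; the checkpoint name and class count are illustrative, and `val_loader` is assumed to be built the same way as `train_loader` above:

```python
# Sketch: instantiating and fine-tuning the classifier (names and sizes are illustrative)
num_rare_disease_classes = 40  # hypothetical number of diagnostic labels
model = MedicalDiagnosisModel(
    model_name="dmis-lab/biobert-v1.1",
    num_classes=num_rare_disease_classes
)
model = fine_tune_model(model, train_loader, val_loader, num_epochs=10, lr=2e-5)
```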
```python
# Knowledge distillation for model compression
import torch.nn.functional as F
from torch.nn import KLDivLoss


class ModelDistillation:
    def __init__(self, teacher_model, student_model, temperature=3.0, alpha=0.7):
        self.teacher = teacher_model
        self.student = student_model
        self.temperature = temperature
        self.alpha = alpha  # Weight for the distillation (soft-target) loss
        self.kl_loss = KLDivLoss(reduction='batchmean')
        self.ce_loss = nn.CrossEntropyLoss()

    def distillation_loss(self, student_logits, teacher_logits, labels):
        """Calculate combined distillation and hard-target loss."""
        # Soft targets from the teacher, softened by the temperature
        teacher_probs = F.softmax(teacher_logits / self.temperature, dim=1)
        student_log_probs = F.log_softmax(student_logits / self.temperature, dim=1)
        distillation_loss = self.kl_loss(student_log_probs, teacher_probs) * (self.temperature ** 2)

        # Hard targets (ground-truth labels)
        student_loss = self.ce_loss(student_logits, labels)

        # Combined loss
        total_loss = self.alpha * distillation_loss + (1 - self.alpha) * student_loss
        return total_loss

    def train_student(self, train_loader, val_loader, num_epochs=20):
        """Train the student model using knowledge distillation."""
        self.teacher.to(device).eval()  # Teacher stays frozen in eval mode
        self.student.to(device)
        optimizer = torch.optim.AdamW(self.student.parameters(), lr=3e-5)

        for epoch in range(num_epochs):
            self.student.train()
            total_loss = 0
            for batch in train_loader:
                optimizer.zero_grad()

                # Teacher predictions (no gradients)
                with torch.no_grad():
                    teacher_outputs = self.teacher(
                        input_ids=batch['input_ids'].to(device),
                        attention_mask=batch['attention_mask'].to(device)
                    )
                    teacher_logits = teacher_outputs['logits']

                # Student predictions
                student_outputs = self.student(
                    input_ids=batch['input_ids'].to(device),
                    attention_mask=batch['attention_mask'].to(device)
                )
                student_logits = student_outputs['logits']

                # Combined distillation + hard-target loss
                loss = self.distillation_loss(
                    student_logits, teacher_logits, batch['labels'].to(device)
                )
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            print(f"Distillation Epoch {epoch}: Loss={total_loss/len(train_loader):.4f}")

        return self.student
```
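For the compression stage, the teacher is the fine-tuned model above and the student is a smaller encoder. The 4-layer BERT checkpoint below is purely illustrative; any backbone that exposes `pooler_output` works with `MedicalDiagnosisModel` as written:

```python
# Sketch: distilling the fine-tuned teacher into a smaller student (checkpoint is illustrative)
teacher = model  # the fine-tuned MedicalDiagnosisModel from above
student = MedicalDiagnosisModel(
    model_name="google/bert_uncased_L-4_H-512_A-8",  # small BERT that exposes a pooler output
    num_classes=num_rare_disease_classes
)

distiller = ModelDistillation(teacher, student, temperature=3.0, alpha=0.7)
student = distiller.train_student(train_loader, val_loader, num_epochs=20)
```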
```python
# Model quantization for further optimization
def quantize_model(model):
    """Apply dynamic int8 quantization to shrink the model and speed up CPU inference."""
    model.eval()
    # Dynamic quantization stores Linear weights as int8 and quantizes
    # activations on the fly, so no calibration pass is required.
    quantized_model = torch.quantization.quantize_dynamic(
        model.cpu(), {nn.Linear}, dtype=torch.qint8
    )
    return quantized_model
```
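Pruning is mentioned alongside distillation but not shown above. A minimal sketch using `torch.nn.utils.prune` with an assumed 30% unstructured sparsity on the student's linear layers (the actual sparsity targets are not given in the case study):

```python
# Sketch: magnitude pruning of the student's linear layers (the 30% sparsity is an assumption)
import torch.nn.utils.prune as prune

for module in student.modules():
    if isinstance(module, nn.Linear):
        # Zero out the 30% of weights with the smallest magnitude
        prune.l1_unstructured(module, name="weight", amount=0.3)
        # Make the pruning permanent by removing the reparameterization
        prune.remove(module, "weight")
```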
| Metric | General Medical AI | Our Fine-Tuned Model | Improvement (relative) |
|---|---|---|---|
| Overall Accuracy | 68.2% | 85.3% | +25.1% |
| Rare Disease F1-Score | 0.52 | 0.78 | +50% |
| Inference Time (ms) | 850 | 125 | -85.3% |
| Model Size (MB) | 1,200 | 480 | -60% |
| Annual Cost per Physician | $50,000 | $12,000 | -76% |
| Memory Usage (GB) | 8.5 | 3.2 | -62.4% |