How to fine-tune with QLoRA in Python
Direct answer
Use the Hugging Face Transformers and PEFT libraries with
BitsAndBytesConfig for 4-bit quantization and LoraConfig to apply QLoRA fine-tuning in Python.

Setup
Install
pip install transformers accelerate bitsandbytes peft datasets

Env vars
HF_HOME, TRANSFORMERS_CACHE

Imports
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
import torch
import os

Examples
In: Fine-tune 'meta-llama/Llama-2-7b-chat-hf' on a small custom dataset with QLoRA
Out: Model loaded with 4-bit quantization, LoRA adapters applied, training started with low VRAM usage.
In: Apply QLoRA fine-tuning on 'meta-llama/Llama-3.1-8B-Instruct' using Hugging Face Trainer
Out: Efficient fine-tuning with LoRA adapters and 4-bit quantization, training completes with reduced memory footprint.
In: Attempt QLoRA fine-tuning without BitsAndBytesConfig
Out: Fails due to missing 4-bit quantization setup, high memory usage, or out-of-memory errors.
Integration steps
- Install required libraries: transformers, accelerate, bitsandbytes, peft, datasets
- Load the pretrained model with 4-bit quantization using BitsAndBytesConfig
- Configure LoRA adapters with LoraConfig targeting causal LM modules
- Wrap the model with get_peft_model to enable LoRA fine-tuning
- Prepare dataset and tokenizer for training
- Use Hugging Face Trainer with TrainingArguments to fine-tune the model
Full code
import os
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import torch

# Load tokenizer
model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Llama tokenizers ship without a pad token; batching in the Trainer needs one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Setup 4-bit quantization config (the "Q" in QLoRA): NF4 + double quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load model with 4-bit quantized weights
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)
# Prepare the quantized model for training (casts norms, enables grad checkpointing)
model = prepare_model_for_kbit_training(model)

# Configure LoRA adapters (the "LoRA" in QLoRA)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
# Apply LoRA adapters
model = get_peft_model(model, lora_config)

# Load a small dataset for fine-tuning
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

# Drop the raw "text" column so the collator only receives tensorizable fields
tokenized_dataset = dataset.map(
    tokenize_function, batched=True, remove_columns=dataset.column_names
)

# Causal-LM collator (mlm=False) pads batches and creates the `labels` field;
# without it the Trainer has no loss to optimize.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Training arguments (evaluation is off by default; no eval dataset is provided)
training_args = TrainingArguments(
    output_dir="./qlora-finetuned",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=3e-4,
    fp16=True,
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    report_to=[],
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Start fine-tuning
trainer.train()
print("QLoRA fine-tuning completed.")

Output
QLoRA fine-tuning completed.
API trace
Request
{"model_name_or_path": "meta-llama/Llama-2-7b-chat-hf", "quantization_config": {"load_in_4bit": true, "bnb_4bit_use_double_quant": true, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": "float16"}, "device_map": "auto"}

Response
{"model": "AutoModelForCausalLM instance with LoRA adapters applied", "training": {"epochs": 3, "batch_size": 4, "learning_rate": 3e-4}}

Extract
Use the Trainer API's train() method output and monitor logs for training progress.

Variants
Streaming fine-tuning logs ›
Use when you want real-time training logs streamed to TensorBoard or console for monitoring.
import os
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import torch

model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Llama tokenizers have no pad token by default; reuse EOS for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config, device_map="auto"
)
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_config)

dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(
    tokenize_function, batched=True, remove_columns=dataset.column_names
)
# mlm=False -> causal-LM collation: pads the batch and copies input_ids to labels
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./qlora-finetuned",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=3e-4,
    fp16=True,
    logging_steps=1,  # log every optimizer step for real-time monitoring
    save_steps=100,
    save_total_limit=2,
    report_to=["tensorboard"],  # stream metrics to TensorBoard
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
trainer.train()
print("Streaming QLoRA fine-tuning completed.")

Async fine-tuning with Accelerate ›
Use for distributed or mixed precision training with better resource management and concurrency.
import os
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import torch
from accelerate import Accelerator

accelerator = Accelerator()
model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Llama tokenizers have no pad token by default; reuse EOS for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
# device_map=None: let Accelerate place the model instead of `accelerate`'s dispatch
model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config, device_map=None
)
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_config)

dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(
    tokenize_function, batched=True, remove_columns=dataset.column_names
)
# The collator pads variable-length sequences into tensors and creates `labels`;
# a plain DataLoader over the raw dataset would yield untensorized lists and
# the forward pass would have no loss to backpropagate.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
train_dataloader = torch.utils.data.DataLoader(
    tokenized_dataset, batch_size=4, shuffle=True, collate_fn=data_collator
)
# Optimize only the trainable (LoRA) parameters
optimizer = torch.optim.AdamW(
    (p for p in model.parameters() if p.requires_grad), lr=3e-4
)
# Prepare model, optimizer AND dataloader so Accelerate handles device
# placement, mixed precision, and distributed wrapping consistently.
model, optimizer, train_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader
)

model.train()
for epoch in range(3):
    for batch in train_dataloader:
        outputs = model(**batch)
        accelerator.backward(outputs.loss)
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch + 1} completed.")
print("Async QLoRA fine-tuning completed.")

Alternative model: Llama 3.1 8B Instruct ›
Use this model for instruction-tuned tasks requiring a smaller but powerful base model.
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import torch

model_name = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Ensure a pad token exists before any batched tokenization/training.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config, device_map="auto"
)
# Required before k-bit fine-tuning: casts norms and enables grad checkpointing
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_config)
# Dataset and training steps same as previous examples
print("Ready to fine-tune Llama 3.1 8B with QLoRA.")

Performance
Latency: ~5-15 minutes per epoch on a single 40GB GPU for 7B-8B parameter models
Cost: ~$0.10-$0.50 per hour on cloud GPUs with 4-bit quantization reducing memory and compute
Rate limits: Depends on cloud GPU provider; no API rate limits for local fine-tuning
- Use smaller batch sizes with gradient accumulation to fit in GPU memory
- Limit sequence length to reduce token processing cost
- Use mixed precision (fp16) to speed up training
| Approach | Latency | Cost/call | Best for |
|---|---|---|---|
| Standard fine-tuning (no quantization) | Long (hours) | High | Highest accuracy but requires large GPU memory |
| QLoRA fine-tuning (4-bit + LoRA) | Moderate (minutes per epoch) | Low | Efficient fine-tuning on limited GPU memory |
| Streaming logs with Trainer | Same as QLoRA | Low | Real-time monitoring during training |
| Async fine-tuning with Accelerate | Same as QLoRA | Low | Distributed or multi-GPU training |
Quick tip
Always use <code>BitsAndBytesConfig</code> with <code>LoraConfig</code> to enable efficient 4-bit QLoRA fine-tuning and reduce GPU memory usage.
Common mistake
Forgetting to set <code>load_in_4bit=True</code> in <code>BitsAndBytesConfig</code> causes out-of-memory errors during QLoRA fine-tuning.