How to fine-tune LLM with LoRA in python
Direct answer

Use the peft library in Python to apply LoRA adapters on a pretrained LLM, then fine-tune it with standard training loops or Hugging Face Trainer APIs.

Setup
Install

pip install transformers datasets peft accelerate

Env vars

HF_HOME, TRANSFORMERS_CACHE

Imports
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training

Examples
In: Fine-tune 'gpt2' on a small custom dataset with LoRA
Out: Model loaded with LoRA adapters, trained for 3 epochs, and saved locally.
In: Apply LoRA to 'facebook/opt-350m' and fine-tune on a text dataset
Out: LoRA adapters added, training run with Hugging Face Trainer, final model saved.
In: Try fine-tuning a quantized model with LoRA
Out: Model prepared for int8 training, LoRA applied, fine-tuning completed successfully.
Integration steps
- Install required libraries: transformers, peft, datasets, accelerate
- Load a pretrained LLM and tokenizer from Hugging Face
- Configure LoRA parameters with LoraConfig
- Wrap the model with LoRA adapters using get_peft_model
- Prepare the model for efficient training (e.g., int8 if quantized)
- Create a dataset and define training arguments
- Use Hugging Face Trainer to fine-tune the LoRA-augmented model
- Save the fine-tuned model with LoRA adapters
Full code
"""Fine-tune GPT-2 with LoRA adapters using the Hugging Face Trainer."""

import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset
# NOTE: peft removed prepare_model_for_int8_training; the current API is
# prepare_model_for_kbit_training (covers both int8 and 4-bit loading).
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Load pretrained model and tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# GPT-2 ships without a padding token; reuse EOS so batched padding works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

# Optional: only call this when the model was actually loaded quantized,
# e.g. AutoModelForCausalLM.from_pretrained(..., load_in_8bit=True).
# model = prepare_model_for_kbit_training(model)

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,                        # rank of the low-rank update matrices
    lora_alpha=32,              # scaling factor
    target_modules=["c_attn"],  # GPT-2's fused QKV attention projection
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# Apply LoRA to the model; base weights are frozen automatically.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Load a small dataset for fine-tuning
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Causal-LM collator (mlm=False) pads each batch and creates the `labels`
# field; without labels the Trainer cannot compute a loss and training fails.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Define training arguments (evaluation defaults to "no", so it is omitted;
# the kwarg was renamed evaluation_strategy -> eval_strategy across versions).
training_args = TrainingArguments(
    output_dir="./lora-finetuned-gpt2",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=50,
    save_total_limit=2,
    learning_rate=3e-4,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU exists
    push_to_hub=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the LoRA adapter weights and tokenizer locally
model.save_pretrained("./lora-finetuned-gpt2")
tokenizer.save_pretrained("./lora-finetuned-gpt2")
print("LoRA fine-tuning complete and model saved.")

Output
LoRA fine-tuning complete and model saved.
API trace
Request
{"model": "gpt2", "train_dataset": "tokenized_dataset", "training_args": {"num_train_epochs": 3, "per_device_train_batch_size": 8, ...}, "peft": {"lora_config": {"r": 8, "lora_alpha": 32, "target_modules": ["c_attn"]}}}

Response

{"training_state": {"epoch": 3, "global_step": 150}, "model": "LoRA fine-tuned model weights", "metrics": {"loss": 1.23}}

Extract

Use the saved model directory './lora-finetuned-gpt2' for inference or further use.

Variants
Streaming fine-tuning with LoRA ›
Use when you want to monitor training progress live with TensorBoard or similar tools.
"""Fine-tune GPT-2 with LoRA, reporting live metrics to TensorBoard."""

import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset
# prepare_model_for_int8_training was renamed prepare_model_for_kbit_training.
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token
model = AutoModelForCausalLM.from_pretrained(model_name)

# Only required when the model is loaded quantized (load_in_8bit=True):
# model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Pads batches and adds `labels` so the Trainer can compute the LM loss.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./lora-finetuned-gpt2-stream",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=50,
    save_total_limit=2,
    learning_rate=3e-4,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    report_to="tensorboard",  # watch live: tensorboard --logdir ./lora-finetuned-gpt2-stream
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
trainer.train()
model.save_pretrained("./lora-finetuned-gpt2-stream")
tokenizer.save_pretrained("./lora-finetuned-gpt2-stream")
print("Streaming LoRA fine-tuning complete.")

Async fine-tuning with LoRA ›
Use when integrating fine-tuning into an async application or pipeline.
"""Run LoRA fine-tuning from an async application without blocking the loop."""

import asyncio
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset
# prepare_model_for_int8_training was renamed prepare_model_for_kbit_training.
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

async def fine_tune_lora():
    """Fine-tune GPT-2 with LoRA, offloading the blocking train loop to a thread."""
    model_name = "gpt2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token
    model = AutoModelForCausalLM.from_pretrained(model_name)
    # Only needed for models loaded quantized (load_in_8bit=True):
    # model = prepare_model_for_kbit_training(model)
    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["c_attn"],
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")

    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True, max_length=128)

    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    # Collator pads batches and creates `labels`; required for the LM loss.
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    training_args = TrainingArguments(
        output_dir="./lora-finetuned-gpt2-async",
        per_device_train_batch_size=8,
        num_train_epochs=3,
        logging_steps=10,
        save_steps=50,
        save_total_limit=2,
        learning_rate=3e-4,
        weight_decay=0.01,
        fp16=torch.cuda.is_available(),
        push_to_hub=False,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )
    # asyncio.get_event_loop() is deprecated inside coroutines;
    # asyncio.to_thread keeps the event loop responsive while training runs.
    await asyncio.to_thread(trainer.train)
    model.save_pretrained("./lora-finetuned-gpt2-async")
    tokenizer.save_pretrained("./lora-finetuned-gpt2-async")
    print("Async LoRA fine-tuning complete.")
asyncio.run(fine_tune_lora())

Fine-tuning with an alternative model (OPT 350M) ›
Use when you want to fine-tune a different architecture like OPT with LoRA.
"""Fine-tune facebook/opt-350m with LoRA adapters on separate Q/V projections."""

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset
# prepare_model_for_int8_training was renamed prepare_model_for_kbit_training.
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model_name = "facebook/opt-350m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Only required when the model is loaded quantized (load_in_8bit=True):
# model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # OPT uses separate attention projections
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Pads batches and adds `labels`; without it the Trainer has no loss to optimize.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./lora-finetuned-opt-350m",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=50,
    save_total_limit=2,
    learning_rate=3e-4,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
trainer.train()
model.save_pretrained("./lora-finetuned-opt-350m")
tokenizer.save_pretrained("./lora-finetuned-opt-350m")
print("LoRA fine-tuning on OPT 350M complete.")

Performance
Latency: ~5-15 minutes per epoch on a single GPU for small datasets
Cost: ~$0.50-$2.00 per fine-tuning run on a single GPU instance
Rate limits: Depends on Hugging Face Hub or cloud provider limits; typically no strict API rate limits for local training
- Use smaller max_length to reduce tokenization overhead
- Limit dataset size during prototyping to speed up training
- Use mixed precision (fp16) to reduce memory and increase speed
| Approach | Latency | Cost/call | Best for |
|---|---|---|---|
| Standard LoRA fine-tuning | ~10 min/epoch | ~$1 per run | Efficient fine-tuning on moderate hardware |
| Streaming LoRA fine-tuning | ~10 min/epoch | ~$1 per run | Live monitoring and logging during training |
| Async LoRA fine-tuning | ~10 min/epoch | ~$1 per run | Integration in async workflows or pipelines |
Quick tip
Always freeze the base model weights and only train LoRA adapters to save memory and speed up fine-tuning.
Common mistake
Forgetting to prepare the model for int8 or 4-bit training before applying LoRA causes training failures or poor performance.