Zero Shot Learning Cheat Sheet — Classify Without Training
Use LLM knowledge to classify new text without task-specific training.
Like asking a human who speaks English to classify Spanish text by describing what the categories mean: they don't need Spanish training data, they infer meaning from language understanding and your description.
Key Concepts
Zero Shot Learning Comparison
| Approach | Training Data Required | Speed | Accuracy | When to Use |
|---|---|---|---|---|
| Zero-Shot | None (only class descriptions) | Instant (no retraining) | 60–80% (depends on clarity) | Rapid prototyping, unknown classes, exploration |
| Few-Shot | 3–10 labeled examples per class | Instant (prompt-based) | 75–90% (improved description) | Limited labels available, emerging use cases |
| Fine-Tuning | Hundreds/thousands of labeled examples | Slow (hours/days retraining) | 90–99% (task-optimized) | High accuracy required, stable taxonomy, production deployment |
| Rule-Based/Regex | None (manual rules) | Instant | 40–70% (brittle) | Simple patterns only, legacy systems |
Zero Shot Learning Patterns
import json
import os

from openai import OpenAI

# Client used by classify_text below. The API key is read from the
# OPENAI_API_KEY environment variable; a missing variable raises KeyError.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def classify_text(text, candidate_labels):
    """Zero-shot classification using LLM.

    Asks the chat model to pick exactly one of ``candidate_labels`` for
    ``text`` and returns the reply with surrounding whitespace removed.

    Args:
        text: The text to classify.
        candidate_labels: Iterable of label strings offered to the model.

    Returns:
        The model's answer as a string. The reply is NOT checked against
        ``candidate_labels``; callers that need a guaranteed label must
        validate it themselves. An empty string is returned when the model
        sends no text content.
    """
    prompt = (
        "You are a classifier. Classify the text into ONE of these "
        f"categories: {', '.join(candidate_labels)}.\n"
        f"Text: {text}\n"
        "Respond ONLY with the chosen category name, nothing else."
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    # The SDK types message.content as optional, so guard before .strip().
    return (response.choices[0].message.content or "").strip()
# Usage
labels = ["urgent", "routine", "spam"]
result = classify_text("Fix the database immediately!", labels)
print(result)
# Output:
# urgent

# --- Next snippet: zero-shot classification with a confidence score ---
import json
import os

from openai import OpenAI

# The API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def classify_with_scores(text, candidate_labels):
    """Zero-shot classification with a self-reported confidence score.

    The model is asked to choose one label and to report a single
    confidence value plus a short justification for that choice (it does
    not return a score for every candidate label).

    Args:
        text: The text to classify.
        candidate_labels: Iterable of label strings offered to the model.

    Returns:
        The parsed JSON object returned by the model. The prompt requests
        the keys "chosen_label", "confidence" and "reasoning", but their
        presence and values are not validated here.

    Raises:
        json.JSONDecodeError: If the reply is empty or not valid JSON.
    """
    prompt = (
        "Classify the text into ONE of these categories and explain your reasoning.\n"
        f"Categories: {', '.join(candidate_labels)}\n"
        f"Text: {text}\n"
        "Respond in JSON format:\n"
        "{\n"
        '"chosen_label": "<one of the categories>",\n'
        '"confidence": <0.0-1.0>,\n'
        '"reasoning": "<brief explanation>"\n'
        "}"
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        # JSON mode keeps the reply parseable (no markdown fences or extra
        # prose around the object); the prompt already mentions JSON, which
        # this mode requires.
        response_format={"type": "json_object"},
        temperature=0,
    )
    # content may be None; an empty string then fails as JSONDecodeError
    # instead of an unrelated TypeError.
    content = (response.choices[0].message.content or "").strip()
    return json.loads(content)
# Usage
labels = ["positive", "negative", "neutral"]
result = classify_with_scores("This product is amazing!", labels)
print(result)
# Example output (values vary between runs):
# {'chosen_label': 'positive', 'confidence': 0.95, 'reasoning': 'Strong positive language...'}

# --- Next snippet: zero-shot classification with a Hugging Face NLI model ---
from transformers import pipeline

# Zero-shot classification via entailment (no fine-tuning required)
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
)

text = "The government announced new climate policies."
candidate_labels = ["politics", "sports", "weather", "entertainment"]

# Each candidate label is substituted into the hypothesis template before
# being scored against the text.
result = classifier(text, candidate_labels, hypothesis_template="This text is about {}")

# result["labels"] and result["scores"] are parallel lists; index 0 is
# printed as the top label.
print(f"Top label: {result['labels'][0]}")
print(f"Scores: {dict(zip(result['labels'], result['scores']))}")
# Output:
# Top label: politics
# Scores: {'politics': 0.95, 'weather': 0.03, 'sports': 0.01, 'entertainment': 0.01}

# --- Next snippet: few-shot classification via in-context examples ---
import os

from openai import OpenAI

# The API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def few_shot_classify(text, candidate_labels, examples):
    """Few-shot classification via in-context learning.

    Builds a prompt that lists the allowed labels, shows every labeled
    example, and then asks the model to label ``text``.

    Args:
        text: The text to classify.
        candidate_labels: Iterable of label strings offered to the model.
        examples: Iterable of dicts, each with a "text" and a "label" key.

    Returns:
        The model's reply with surrounding whitespace removed. The reply is
        not checked against ``candidate_labels``. An empty string is
        returned when the model sends no text content.

    Raises:
        KeyError: If an example lacks the "text" or "label" key.
    """
    examples_text = "\n".join(
        f"Text: {ex['text']}\nLabel: {ex['label']}" for ex in examples
    )
    prompt = (
        f"Classify the following text into one of: {', '.join(candidate_labels)}.\n"
        "Examples:\n"
        f"{examples_text}\n"
        "Now classify this:\n"
        f"Text: {text}\n"
        "Label:"
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    # The SDK types message.content as optional, so guard before .strip().
    return (response.choices[0].message.content or "").strip()
# Usage
examples = [
{"text": "I love this!", "label": "positive"},
{"text": "Terrible experience.", "label": "negative"},
{"text": "It's okay.", "label": "neutral"},
]
result = few_shot_classify(
"Best purchase ever!",
["positive", "negative", "neutral"],
examples
)
print(result) # Output: "positive" positive Common Errors & Fixes
Model returns wrong label not in candidate_labels
Cause: The prompt is too vague, the candidate labels are confusing or overlapping, or the model hallucinates. LLMs are not constrained to output only the allowed labels.
Fix: Add an explicit constraint to the prompt and parse/validate the output. Use structured extraction with JSON mode:
import json
import os

from openai import OpenAI

# The API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def safe_classify(text, labels):
labels_json = json.dumps(labels)
response = client.chat.completions.create(
model="gpt-4o",
messages=[{
"role": "user",
"content": f"""Classify '{text}' as one of {labels_json}. Respond ONLY with valid JSON: {{"label": "<one from list>"}}"""
}],
response_format={"type": "json_object"},
temperature=0
)
result = json.loads(response.choices[0].message.content)
if result["label"] not in labels:
return labels[0] # Fallback to first label
return result["label"] Accuracy drops dramatically with new domain Cause: Zero-shot relies on pre-training; unfamiliar domains or niche vocabulary exceed model knowledge. Generic label names don't capture domain semantics.
Use descriptive labels with context instead of short class names:
# Bad: short, opaque class names give the model no description to work from.
#   labels = ["A", "B", "C"]
# Good: each label spells out the category in natural language.
labels = [
"Customer complaint about billing or payment",
"Product quality or defect issue",
"Feature request or enhancement suggestion"
]
Or use few-shot with 3–5 domain examples to ground the model.
Inconsistent predictions on similar inputs
Cause: The model is non-deterministic; temperature > 0 adds randomness. An ambiguous prompt also lets the model pick different interpretations.
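Set temperature=0 to make results as repeatable as possible (hosted APIs may still show small run-to-run differences even at temperature 0). Also clarify the prompt and the label descriptions: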
# Same request shape as the classifiers above; only `temperature` matters here.
response = client.chat.completions.create(
model="gpt-4o",
messages=[...],  # placeholder: pass the real message list here
temperature=0 # Removes sampling randomness as far as the API allows
)
Also add an explicit decision rule to the prompt: "If uncertain, choose X."
HuggingFace model OOM on large label sets
Cause: Entailment models compute scores for every label; 100+ labels = huge computation graph.
Limit candidate labels to top 10–20 by semantic relevance. Or use hierarchical approach: coarse classification first, then fine-grained:
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# Coarse classification
coarse_result = classifier(text, ["Technical", "Non-Technical"])
if coarse_result["labels"][0] == "Technical":
# Fine-grained
result = classifier(text, ["Bug", "Feature", "Performance"])
else:
result = classifier(text, ["Marketing", "Sales", "HR"]) Production Gotchas
Zero-shot depends on model understanding of labels. "Class A" is useless; "Customer complaint about billing" is 5x more accurate. Use natural language descriptions that capture intent.
Bad examples teach wrong patterns. If your 3 examples are all sarcasm-heavy, model will over-interpret sarcasm in new inputs. Curate examples to represent true distribution.
GPT-4o knows more than local BERT models. If your labels reference recent events or niche jargon, LLM-based zero-shot outperforms small Hugging Face models.
Labels like "positive", "very positive", "happy" are semantically close; model may rank them unpredictably. Use mutually exclusive, well-separated categories.
Zero-shot via LLM API charges per token. Long documents + multiple classification passes = high cost. Truncate or batch; consider local models for cost-sensitive workloads.
HuggingFace entailment models use `hypothesis_template=` to frame classification. "This text is about {}" vs "It is {}" changes scores. Test multiple templates.
Complete end-to-end zero-shot pipeline: load text, classify, validate, and log confidence.
from openai import OpenAI
import json
import os
from typing import NamedTuple
# Module-level client used by zero_shot_classify_pipeline; the API key is
# read from the OPENAI_API_KEY environment variable (KeyError if unset).
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
class ClassificationResult(NamedTuple):
    """Immutable record describing the outcome of one classification call."""

    # The input text that was classified.
    text: str
    # The label reported for the text (may be a fallback or error marker
    # chosen by the producer; check ``is_valid``).
    predicted_label: str
    # Confidence value associated with the prediction.
    confidence: float
    # Short explanation accompanying the prediction.
    reasoning: str
    # Whether the producer considered the prediction acceptable.
    is_valid: bool
def zero_shot_classify_pipeline(
    text: str,
    candidate_labels: list[str],
    confidence_threshold: float = 0.7
) -> ClassificationResult:
    """Production-ready zero-shot classifier with validation.

    Sends one JSON-mode chat request asking the model to choose a label,
    report a confidence, and give a short reason, then validates the reply.

    Args:
        text: The text to classify.
        candidate_labels: Non-empty list of allowed labels.
        confidence_threshold: Minimum model-reported confidence for the
            prediction to count as valid.

    Returns:
        A ClassificationResult. ``is_valid`` is True only when the model's
        label is one of ``candidate_labels`` AND its confidence reaches the
        threshold. When ``is_valid`` is False, ``predicted_label`` is the
        fallback ``candidate_labels[0]`` (not the model's answer), so
        callers must check ``is_valid`` before trusting the label. If the
        reply cannot be parsed, ``predicted_label`` is "ERROR" with
        confidence 0.0.

    Raises:
        ValueError: If ``candidate_labels`` is empty (no fallback exists).
    """
    if not candidate_labels:
        raise ValueError("candidate_labels must contain at least one label")

    # Build prompt
    labels_str = ", ".join(candidate_labels)
    prompt = (
        f"Classify the following text into ONE category from: {labels_str}.\n"
        f"Text: {text}\n"
        "Respond in JSON format:\n"
        "{\n"
        '"label": "<chosen label>",\n'
        '"confidence": <0.0-1.0>,\n'
        '"reasoning": "<brief explanation>"\n'
        "}"
    )

    # Query model. API/network errors propagate to the caller; only reply
    # parsing problems are converted into an "ERROR" result below.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        # JSON mode keeps the reply parseable (no markdown fences or prose
        # around the object); the prompt already mentions JSON as required.
        response_format={"type": "json_object"},
        temperature=0,
    )

    # Parse reply. Everything that can fail on a malformed reply lives in
    # this try block: missing content, invalid JSON, a non-object payload,
    # or a confidence value that cannot be converted to float.
    try:
        content = (response.choices[0].message.content or "").strip()
        result_json = json.loads(content)
        raw_label = result_json.get("label", "")
        label = raw_label.strip() if isinstance(raw_label, str) else ""
        confidence = float(result_json.get("confidence", 0.0))
        reasoning = str(result_json.get("reasoning", ""))
    except (json.JSONDecodeError, KeyError, AttributeError, TypeError, ValueError) as e:
        print(f"Parse error: {e}")
        return ClassificationResult(
            text=text,
            predicted_label="ERROR",
            confidence=0.0,
            reasoning="Failed to parse model response",
            is_valid=False
        )

    # Validate and return
    is_valid = (
        label in candidate_labels and
        confidence >= confidence_threshold
    )
    return ClassificationResult(
        text=text,
        predicted_label=label if is_valid else candidate_labels[0],
        confidence=confidence,
        reasoning=reasoning,
        is_valid=is_valid
    )
# Usage
if __name__ == "__main__":
    # Sample inputs run through the pipeline with the default threshold.
    texts = [
        "The API is down and customers cannot login.",
        "We should add dark mode to the app.",
        "Terrible support experience, never coming back.",
    ]
    labels = ["bug", "feature_request", "complaint"]

    for text in texts:
        result = zero_shot_classify_pipeline(text, labels)
        print(f"Text: {result.text}")
        print(f"Label: {result.predicted_label} (confidence: {result.confidence:.2f})")
        print(f"Valid: {result.is_valid}\n")