Test Generation Cheat Sheet — AI-Powered Test Coverage
Use LLMs to write test cases based on code structure, requirements, and edge case patterns.
Like a code reviewer who knows every edge case: the LLM reads your function and writes the tests a senior engineer would write, but 100x faster.
Key Concepts
Test Generation Patterns
import os
from openai import OpenAI
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
function_code = '''
def calculate_discount(price: float, is_member: bool) -> float:
"""Apply 10% discount if member, 5% if not."""
if price < 0:
raise ValueError("Price cannot be negative")
return price * (0.9 if is_member else 0.95)
'''
prompt = f"""Generate pytest unit tests for this function. Include:
- Happy path (member and non-member)
- Edge case: price=0
- Error: negative price
- Type validation
{function_code}
Output ONLY valid Python pytest code."""
response = client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": prompt}],
temperature=0.7
)
generated_tests = response.choices[0].message.content
print(generated_tests) import pytest
from module import calculate_discount
def test_calculate_discount_member():
assert calculate_discount(100.0, True) == 90.0
def test_calculate_discount_non_member():
assert calculate_discount(100.0, False) == 95.0
def test_calculate_discount_zero():
assert calculate_discount(0.0, True) == 0.0
def test_calculate_discount_negative_price():
with pytest.raises(ValueError):
calculate_discount(-10.0, True) import json
import os
from openai import OpenAI
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def refine_tests_for_coverage(
    code: str,
    current_coverage: float,
    target: float = 90.0,
    max_iterations: int = 5,
    coverage_fn=None,
):
    """Ask the LLM for batches of new tests until coverage reaches ``target``.

    Args:
        code: Source code under test; embedded verbatim in the prompt.
        current_coverage: Coverage percentage measured before the first batch.
        target: Coverage percentage at which generation stops.
        max_iterations: Upper bound on the number of LLM calls.
        coverage_fn: Optional callable ``(code, tests) -> float`` invoked after
            each batch to re-measure coverage. When omitted, the coverage
            figure never changes, so a run that starts below ``target``
            always performs ``max_iterations`` calls.

    Returns:
        The raw LLM responses, one string per batch, in generation order.
        Empty when ``current_coverage`` already meets ``target``.
    """
    tests = []
    for _ in range(max_iterations):
        if not current_coverage < target:
            break
        # Send the tests produced so far: a bare count gives the model no way
        # to honour the "not duplicates" instruction.
        existing_tests = "\n\n".join(tests) if tests else "(none yet)"
        prompt = f"""Code coverage is {current_coverage}%. Generate 3 new pytest test cases
to reach {target}% coverage. Focus on untested branches.
Existing code:\n{code}
Current tests ({len(tests)} batches so far):\n{existing_tests}
Target coverage: {target}%
Generate ONLY new test functions, not duplicates."""
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.8,
        )
        tests.append(response.choices[0].message.content)
        # Without a re-measurement the loop can only end on the iteration cap.
        if coverage_fn is not None:
            current_coverage = coverage_fn(code, tests)
    return tests
# Usage
test_suite = refine_tests_for_coverage(
code="def process_order(items): ...",
current_coverage=65.0,
target=90.0
) import os
from openai import OpenAI
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
function_spec = '''
def normalize_email(email: str) -> str:
"""Convert email to lowercase, strip whitespace, validate format."""
email = email.strip().lower()
if '@' not in email or '.' not in email.split('@')[1]:
raise ValueError("Invalid email")
return email
'''
prompt = f"""Generate Hypothesis property-based tests for this function.
Include invariants that MUST always hold true (e.g., output is lowercase).
{function_spec}
Example invariant: output should always be lowercase and equal to input.strip().lower()
Output a complete, runnable Hypothesis test."""
response = client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": prompt}]
)
print(response.choices[0].message.content) from hypothesis import given, strategies as st
from module import normalize_email
@given(st.emails())
def test_normalize_email_lowercase(email):
result = normalize_email(email)
assert result == result.lower()
@given(st.text())
def test_normalize_email_idempotent(text):
try:
result = normalize_email(text)
# Normalizing twice should give same result
assert normalize_email(result) == result
except ValueError:
pass import json
import os
from openai import OpenAI
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
openapi_spec = {
"paths": {
"/users/{id}": {
"get": {
"parameters": [{"name": "id", "in": "path", "schema": {"type": "integer"}}],
"responses": {
"200": {"description": "User found"},
"404": {"description": "User not found"},
"401": {"description": "Unauthorized"}
}
}
}
}
}
prompt = f"""Generate pytest tests for this API endpoint scenario:
1. Valid user ID → 200 response with user object
2. Non-existent user → 404 response
3. Missing auth header → 401 response
4. Malformed ID → 400 response
5. Boundary: ID = 0, ID = 999999999
API Spec:\n{json.dumps(openapi_spec, indent=2)}
Generate complete pytest tests using requests mock or responses library."""
response = client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": prompt}]
)
print(response.choices[0].message.content) import subprocess
import json
import os
from openai import OpenAI
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def validate_with_mutations(test_file: str, code_file: str):
    """Run mutmut on ``code_file`` and ask the LLM for more tests if the score is low.

    Args:
        test_file: Path to the test module. Its directory is passed to mutmut
            as ``--tests-dir`` (falls back to ``"tests"`` when the path has no
            directory component).
        code_file: Path passed to mutmut as ``--paths-to-mutate``.

    Returns:
        The LLM's response text when the mutation score is below 80,
        otherwise a one-line summary string.

    Raises:
        ValueError: mutmut's stdout is not a JSON object.
    """
    # Install: pip install mutmut
    tests_dir = os.path.dirname(test_file) or "tests"
    result = subprocess.run(
        ["mutmut", "run", "--tests-dir", tests_dir, "--paths-to-mutate", code_file],
        capture_output=True,
        text=True,
    )
    # NOTE(review): this assumes mutmut prints a JSON summary with "total" and
    # "killed" keys on stdout -- confirm against the installed mutmut version.
    try:
        output = json.loads(result.stdout)
    except json.JSONDecodeError as err:
        raise ValueError(
            "mutmut did not produce JSON on stdout "
            f"(exit code {result.returncode}): {result.stderr.strip()[:200]!r}"
        ) from err
    if not isinstance(output, dict):
        raise ValueError(
            f"Expected a JSON object from mutmut, got {type(output).__name__}"
        )
    total_mutants = output.get("total", 0)
    killed = output.get("killed", 0)
    mutation_score = (killed / total_mutants * 100) if total_mutants > 0 else 0
    if mutation_score < 80:
        # Ask LLM to improve tests
        prompt = f"""Your tests have {mutation_score}% mutation score (should be >80%).
These mutations survived:\n{result.stdout}
Generate additional tests to kill more mutants."""
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content
    return f"Mutation score: {mutation_score}% ✓"
# Usage
print(validate_with_mutations("tests/test_module.py", "src/module.py"))
Test Generation Comparison
| Approach | Speed | Coverage | Maintainability | Best For |
|---|---|---|---|---|
Common Errors & Fixes
Generated tests pass but don't catch real bugs
Cause: The LLM generates tests that exercise code paths but use weak assertions (checking only the return type, not the value), so the tests never fail unless there's a syntax error.
Validate tests against mutation testing. Inject intentional bugs into source code and verify tests fail.
import subprocess
import tempfile
import os
def validate_tests_catch_bugs(test_file, code_file, code_with_bug):
    """Ensure the tests fail when the code under test is broken.

    The buggy source is written into a throwaway directory under the same
    file name as ``code_file``, next to a copy of ``test_file``, and pytest is
    run from that directory. The real ``code_file`` is never modified.

    Assumes the tests import the module by its bare name (for example
    ``from module import f``) so that the sandbox copy is the one picked up --
    TODO confirm for package-style imports.

    Args:
        test_file: Path to the pytest file to run.
        code_file: Path of the module under test; only its base name is used.
        code_with_bug: Full source text of the deliberately broken module.

    Returns:
        A short summary string when at least one test fails.

    Raises:
        AssertionError: The tests pass against the broken code.
        RuntimeError: pytest exits with a code other than 0 or 1 (collection
            error, usage error, no tests collected), so nothing can be
            concluded about the tests.
    """
    with tempfile.TemporaryDirectory() as sandbox:
        # Same base name as the real module so the tests' imports resolve to it.
        broken_path = os.path.join(sandbox, os.path.basename(code_file))
        with open(broken_path, 'w') as broken:
            broken.write(code_with_bug)
        sandbox_test = os.path.join(sandbox, os.path.basename(test_file))
        with open(test_file) as original, open(sandbox_test, 'w') as copy:
            copy.write(original.read())
        # Run tests against broken code
        result = subprocess.run(
            ['pytest', sandbox_test, '--tb=short'],
            capture_output=True,
            text=True,
            cwd=sandbox,
        )
    if result.returncode == 0:
        raise AssertionError("Tests passed even with intentional bug! Tests are too weak.")
    if result.returncode != 1:
        # Exit code 1 is the only one that means "tests ran and some failed".
        raise RuntimeError(
            f"pytest exited with code {result.returncode}; tests did not run cleanly: "
            f"{(result.stdout or result.stderr)[:200]}"
        )
    return f"✓ Tests correctly caught bug: {result.stdout[:200]}"
# LLM imports non-existent or outdated test libraries
# Cause: Training data includes old pytest plugins, removed libraries, or hallucinated package names. Generated tests fail immediately on import.
Pre-validate imports by parsing generated code before execution.
import ast
import importlib
def validate_imports(test_code: str) -> list:
    """Check that every absolute import in ``test_code`` can be imported.

    Both ``import x`` and ``from x import y`` statements are checked; for the
    latter only the module ``x`` is verified. Relative imports
    (``from . import y``) are skipped because they cannot be resolved without
    package context.

    NOTE: modules are really imported, so their top-level code runs. Only use
    this on code whose dependencies you are willing to load.

    Args:
        test_code: Python source text of the generated tests.

    Returns:
        An empty list when every import resolves.

    Raises:
        SyntaxError: ``test_code`` is not valid Python.
        ImportError: One or more modules cannot be imported; the message lists
            each missing module once.
    """
    tree = ast.parse(test_code)
    missing = []

    def check(module_name):
        # Report each missing module once, even if imported in several places.
        if module_name in missing:
            return
        try:
            importlib.import_module(module_name)
        except ImportError:
            missing.append(module_name)

    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            for alias in node.names:
                check(alias.name)
        elif isinstance(node, ast.ImportFrom):
            # level > 0 means a relative import; node.module is None for "from . import x".
            if node.level == 0 and node.module:
                check(node.module)
    if missing:
        raise ImportError(f"Generated test imports non-existent packages: {missing}")
    return missing
# Usage before executing tests
validate_imports(generated_test_code)
Mock setup doesn't match real system behavior
Cause: The LLM mocks external services incorrectly (wrong return types, missing side effects), so tests pass in isolation but fail in integration.
Use recorded VCR cassettes and contract testing instead of LLM-generated mocks.
# Install: pip install vcrpy
import vcr
my_vcr = vcr.VCR(record_mode='once')
@my_vcr.use_cassette('api_responses.yaml')
def test_with_real_recorded_response():
"""Uses recorded HTTP responses, not LLM-generated mocks."""
response = requests.get('https://api.example.com/users')
assert response.status_code == 200
# For contract testing:
# Use Pact to verify mock matches real API contract
import json
from pact import Consumer, Provider
pact = Consumer('TestClient').has_state(
'user exists',
upon_receiving='a request for user 123'
).will_respond_with('a 200 response with user data')
Coverage metrics mislead: 100% coverage, 0% mutation score
Cause: Generated tests exercise all lines but make no assertions. If the code runs without error the test passes, so correctness is never verified.
Always pair coverage with mutation testing. Require both metrics.
# Install: pip install mutmut pytest-cov
import subprocess
def enforce_quality_gates(threshold_coverage=90, threshold_mutation=80):
    """Fail CI if either line coverage or mutation score is below its threshold.

    Args:
        threshold_coverage: Minimum acceptable ``percent_covered`` value.
        threshold_mutation: Minimum acceptable percentage of killed mutants.

    Raises:
        AssertionError: Either metric is below its threshold. An empty
            mutation report counts as a score of 0.
    """
    # This snippet's own import list only has subprocess, so import json here.
    import json

    # Line coverage: pytest-cov writes its JSON report to coverage.json.
    subprocess.run(
        ['pytest', '--cov=src', '--cov-report=json'],
        capture_output=True
    )
    with open('coverage.json') as report:
        coverage = json.load(report)
    percent_covered = coverage['totals']['percent_covered']
    if percent_covered < threshold_coverage:
        raise AssertionError(f"Coverage {percent_covered}% < {threshold_coverage}%")

    # Mutation score
    # NOTE(review): assumes `mutmut run --json` prints a JSON list of
    # {"status": ...} records -- confirm against the installed mutmut version.
    mut_result = subprocess.run(['mutmut', 'run', '--json'], capture_output=True)
    mutation_data = json.loads(mut_result.stdout)
    killed = sum(1 for m in mutation_data if m.get('status') == 'killed')
    # No mutants at all would otherwise divide by zero.
    score = (killed / len(mutation_data)) * 100 if mutation_data else 0.0
    if score < threshold_mutation:
        raise AssertionError(f"Mutation score {score}% < {threshold_mutation}%")
    print(f"✓ Coverage: {percent_covered}% | Mutation: {score}%")
# Production Gotchas
Claude or GPT-4o may generate tests for features your code doesn't have (e.g., async when function is sync, or Redis caching when you use in-memory). Always review generated tests against actual requirements. Use structured prompts with explicit constraints: 'Do NOT generate tests for async: this is synchronous code.'
A test passes against a perfect mock but fails against real external service (wrong field names, different error codes, timeout behavior). Use contract testing (Pact, Spring Cloud Contract) instead of LLM-generated mocks. Record real interactions once, replay them in tests.
Asking GPT-4o to generate tests for a 10,000-line module hits token limits and produces incomplete tests. Break code into smaller units (functions ~20 lines each) or use purpose-built test generators (e.g. the Hypothesis ghostwriter) instead of monolithic LLM calls.
LLM generates 'happy path' tests but skips boundary conditions (INT_MAX, empty strings, None values). Explicitly ask for: 'Include tests for: empty input, None, zero, negative values, max int.'
mutmut runs your code thousands of times: if your test suite already takes 5 minutes, mutation testing takes hours. Use sampling (mutmut --paths-to-mutate only changed files) or split into fast mutation checks (arithmetic operators) vs slow checks (all operators).
LLM generates `import pytest; from mock import patch`, but the standalone `mock` package is a third-party backport that may not be installed in your CI; `unittest.mock` is the standard-library equivalent. Always validate that generated imports work in your target Python version and environment.
End-to-End Test Generation Pipeline: Input Code → LLM → Generated Tests → Validation → Coverage Report
import os
import json
import subprocess
import tempfile
from openai import OpenAI
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def _strip_markdown_fence(text: str) -> str:
    """Return the contents of the first ``` fenced block, or ``text`` unchanged if unfenced."""
    stripped = text.strip()
    if not stripped.startswith('```'):
        return text
    body = stripped.split('```')[1]
    if body.startswith('python'):
        body = body[len('python'):]
    return body


def generate_and_validate_tests(source_code: str, module_name: str):
    """Generate tests with the LLM, then syntax-check, run, and score them.

    Steps: request pytest code from the LLM, strip any markdown fence, compile
    it, write source and tests to a temporary directory, run pytest with
    coverage, then run mutmut.

    Args:
        source_code: Source text of the module under test.
        module_name: Module name without ``.py``; used for the temporary
            source file, the test file name and the ``--cov`` target.

    Returns:
        ``None`` if the generated tests do not compile or pytest exits
        non-zero. Otherwise a dict with keys ``"test_code"``, ``"coverage"``
        and ``"mutation_score"``; each metric is a number, or the string
        ``"N/A"`` when its report is missing or unparsable.
    """
    print("[1] Generating tests with LLM...")
    prompt = f"""Generate comprehensive pytest unit tests for this Python code.
Include:
- Happy path
- Edge cases (zero, None, empty, max values)
- Error conditions
- Type validation
Output ONLY valid, runnable pytest code. No explanations.
```python
{source_code}
```"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7
    )
    # Remove markdown fencing if present; treat a missing message body as empty.
    test_code = _strip_markdown_fence(response.choices[0].message.content or "")

    print("[2] Validating test syntax...")
    try:
        compile(test_code, "<tests>", "exec")
    except SyntaxError as e:
        print(f"✗ Syntax error in generated tests: {e}")
        return None
    print("✓ Syntax valid")

    # Both metrics default to "N/A" so the summary below never hits an unbound name.
    coverage_pct = "N/A"
    mutation_score = "N/A"

    print("[3] Writing to temp files and running tests...")
    with tempfile.TemporaryDirectory() as tmpdir:
        # Write source code
        src_path = os.path.join(tmpdir, f"{module_name}.py")
        with open(src_path, 'w') as f:
            f.write(source_code)
        # Write test code
        test_path = os.path.join(tmpdir, f"test_{module_name}.py")
        with open(test_path, 'w') as f:
            f.write(test_code)

        # Run pytest with coverage
        print("[4] Running tests with coverage...")
        result = subprocess.run(
            ["pytest", test_path, "--cov=" + module_name,
             "--cov-report=json", "-v"],
            capture_output=True,
            text=True,
            cwd=tmpdir
        )
        print(result.stdout)
        if result.returncode != 0:
            print(f"✗ Test execution failed: {result.stderr}")
            return None

        # Parse coverage: the JSON report is written to coverage.json in cwd.
        try:
            with open(os.path.join(tmpdir, "coverage.json")) as f:
                coverage_pct = json.load(f)["totals"]["percent_covered"]
            print(f"[5] Code Coverage: {coverage_pct}%")
        except (OSError, ValueError, KeyError, TypeError):
            print("[5] Coverage report unavailable")

        # Run mutation testing against the file written above.
        print("[6] Running mutation testing...")
        mut_result = subprocess.run(
            ["mutmut", "run", "--paths-to-mutate", f"{module_name}.py"],
            capture_output=True,
            text=True,
            cwd=tmpdir
        )
        # NOTE(review): assumes mutmut prints a JSON summary with "total" and
        # "killed" on stdout -- confirm against the installed mutmut version.
        if "total" in mut_result.stdout:
            try:
                mut_data = json.loads(mut_result.stdout)
                killed = mut_data.get("killed", 0)
                total = mut_data.get("total", 1)
                mutation_score = (killed / total * 100) if total > 0 else 0.0
                print(f"[7] Mutation Score: {mutation_score:.1f}% ({killed}/{total} killed)")
            except (ValueError, AttributeError, TypeError):
                mutation_score = "N/A"
                print("[7] Mutation testing failed to parse")

    if isinstance(mutation_score, (int, float)):
        mutation_label = f"{mutation_score:.1f}"
    else:
        mutation_label = mutation_score
    print(f"\n{'='*60}")
    print(f"Generated tests ({len(test_code)} chars)")
    print(f"Coverage: {coverage_pct}% | Mutation: {mutation_label}%")
    print(f"{'='*60}")
    return {
        "test_code": test_code,
        "coverage": coverage_pct,
        "mutation_score": mutation_score
    }
# Example: Generate tests for a simple function
sample_code = '''
def validate_email(email: str) -> bool:
"""Return True if email format is valid, False otherwise."""
if not isinstance(email, str):
raise TypeError("Email must be string")
if not email or '@' not in email:
return False
local, domain = email.rsplit('@', 1)
if not local or not domain or '.' not in domain:
return False
return True
'''
result = generate_and_validate_tests(sample_code, "email_validator")
if result:
print("\nGenerated test file ready for production:")
print(result["test_code"][:500] + "...")