Cheat Sheet intermediate · 8 min read

Test Generation Cheat Sheet — AI-Powered Test Coverage

version current

Generate comprehensive test suites automatically with AI

Mental model

Use LLMs to write test cases based on code structure, requirements, and edge case patterns.

Like a code reviewer who knows every edge case: the LLM reads your function and writes the tests a senior engineer would write, but 100x faster.

Key Concepts

Unit Test Generation

AI-generated tests for individual functions with mocked dependencies, covering happy path, edge cases, and error conditions.

Integration Test Generation

Tests that verify how multiple components interact, typically generated by analyzing component boundaries and data flows.

Property-Based Testing

LLM-generated parameterized tests that verify invariants hold across many random inputs using tools like Hypothesis.

Coverage-Guided Generation

Iterative test generation that targets code paths not yet covered, maximizing branch and path coverage automatically.

Mutation Testing

Generated tests validated against intentional code mutations to ensure tests actually catch bugs, not just run without errors.

Scenario-Based Generation

Tests derived from user stories, API specs, or domain requirements, rather than just structural code analysis.

Test Generation Patterns

01 LLM-Driven Test Generation

Generate unit tests from function signature and docstring

python

import os
from openai import OpenAI

# The API key is taken from the environment so it never appears in source.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Source of the function under test, embedded verbatim in the prompt.
function_code = '''
def calculate_discount(price: float, is_member: bool) -> float:
    """Apply 10% discount if member, 5% if not."""
    if price < 0:
        raise ValueError("Price cannot be negative")
    return price * (0.9 if is_member else 0.95)
'''

# Spell out the scenarios we want so the model does not stop at the happy path.
prompt = (
    "Generate pytest unit tests for this function. Include:\n"
    "- Happy path (member and non-member)\n"
    "- Edge case: price=0\n"
    "- Error: negative price\n"
    "- Type validation\n"
    "\n"
    f"{function_code}\n"
    "\n"
    "Output ONLY valid Python pytest code."
)

chat_messages = [{"role": "user", "content": prompt}]
response = client.chat.completions.create(
    model="gpt-4o",
    messages=chat_messages,
    temperature=0.7,
)

# The first choice carries the generated test module as plain text.
generated_tests = response.choices[0].message.content
print(generated_tests)

output

import pytest
from module import calculate_discount

def test_calculate_discount_member():
    """Members pay 90% of the list price."""
    # approx guards against binary floating-point rounding in price * 0.9
    assert calculate_discount(100.0, True) == pytest.approx(90.0)

def test_calculate_discount_non_member():
    """Non-members pay 95% of the list price."""
    # approx guards against binary floating-point rounding in price * 0.95
    assert calculate_discount(100.0, False) == pytest.approx(95.0)

def test_calculate_discount_zero():
    """A zero price stays zero after the member discount."""
    discounted = calculate_discount(0.0, True)
    assert discounted == 0.0

def test_calculate_discount_negative_price():
    """A negative price is rejected with the documented error message."""
    # match= pins the message so an unrelated ValueError cannot satisfy the test
    with pytest.raises(ValueError, match="Price cannot be negative"):
        calculate_discount(-10.0, True)

LLMs can generate syntactically correct but semantically wrong assertions. Always verify that generated tests actually run, and that they fail when the code is intentionally broken.

02 Coverage-Guided Test Refinement

Iteratively generate tests to reach target coverage threshold

python

import json
import os
from openai import OpenAI

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def refine_tests_for_coverage(code: str, current_coverage: float, target: float = 90.0,
                              max_iterations: int = 5, measure_coverage=None):
    """Ask the LLM for new tests in rounds until coverage reaches the target.

    Args:
        code: Source code under test, embedded verbatim in each prompt.
        current_coverage: Coverage percentage measured before any new tests.
        target: Coverage percentage at which generation stops.
        max_iterations: Upper bound on the number of LLM calls.
        measure_coverage: Optional callable ``(code, tests) -> float`` that
            re-measures coverage after each round (for example by running
            ``pytest --cov`` and parsing its report). When omitted, coverage
            is never re-measured, so the loop makes ``max_iterations`` calls
            whenever the starting coverage is below ``target``.

    Returns:
        A list with one raw LLM response per round, in generation order.
        Empty when ``current_coverage`` already meets ``target``.
    """
    tests = []

    for _ in range(max_iterations):
        if current_coverage >= target:
            break

        prompt = f"""Code coverage is {current_coverage}%. Generate 3 new pytest test cases
to reach {target}% coverage. Focus on untested branches.

Existing code:\n{code}

Current tests: {len(tests)}
Target coverage: {target}%

Generate ONLY new test functions, not duplicates."""

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.8
        )
        tests.append(response.choices[0].message.content)

        # Without a measurement hook the coverage figure stays at its
        # starting value and only the iteration cap ends the loop.
        if measure_coverage is not None:
            current_coverage = measure_coverage(code, tests)

    return tests

# Usage: start from 65% measured coverage and aim for 90%
test_suite = refine_tests_for_coverage("def process_order(items): ...", 65.0, 90.0)

Blindly chasing coverage metrics generates useless tests. Pair with mutation testing to verify tests actually catch bugs.

03 Property-Based Test Generation

Generate Hypothesis strategies for parameterized testing

python

import os
from openai import OpenAI

# The API key is taken from the environment so it never appears in source.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Function whose invariants the model should turn into Hypothesis properties.
function_spec = '''
def normalize_email(email: str) -> str:
    """Convert email to lowercase, strip whitespace, validate format."""
    email = email.strip().lower()
    if '@' not in email or '.' not in email.split('@')[1]:
        raise ValueError("Invalid email")
    return email
'''

# Give one concrete invariant so the model knows what kind of property to write.
prompt = (
    "Generate Hypothesis property-based tests for this function.\n"
    "Include invariants that MUST always hold true (e.g., output is lowercase).\n"
    "\n"
    f"{function_spec}\n"
    "\n"
    "Example invariant: output should always be lowercase and equal to input.strip().lower()\n"
    "\n"
    "Output a complete, runnable Hypothesis test."
)

chat_messages = [{"role": "user", "content": prompt}]
response = client.chat.completions.create(model="gpt-4o", messages=chat_messages)

print(response.choices[0].message.content)

output

from hypothesis import given, strategies as st
from module import normalize_email

@given(st.emails())
def test_normalize_email_lowercase(email):
    """Normalized output contains no uppercase characters."""
    normalized = normalize_email(email)
    assert normalized.lower() == normalized

@given(st.text())
def test_normalize_email_idempotent(text):
    """Normalizing an already-normalized email returns it unchanged."""
    try:
        result = normalize_email(text)
    except ValueError:
        # Arbitrary text is usually not a valid email; rejection is expected.
        return
    # Kept outside the try: a ValueError from the second call would be a real
    # idempotency bug and must fail the test instead of being swallowed.
    assert normalize_email(result) == result

Hypothesis runs each property against many generated inputs, so keep the LLM out of the test loop: generate the test code and strategies once, save them, and let Hypothesis replay them as many times as needed.

04 Scenario-Based Test Generation from Specs

Generate integration tests from OpenAPI/API specification

python

import json
import os
from openai import OpenAI

# The API key is taken from the environment so it never appears in source.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Minimal OpenAPI fragment for GET /users/{id}, assembled from its parts.
id_parameter = {"name": "id", "in": "path", "schema": {"type": "integer"}}
documented_responses = {
    "200": {"description": "User found"},
    "404": {"description": "User not found"},
    "401": {"description": "Unauthorized"},
}
openapi_spec = {
    "paths": {
        "/users/{id}": {
            "get": {
                "parameters": [id_parameter],
                "responses": documented_responses,
            }
        }
    }
}

# Serialize once; the pretty-printed spec is pasted into the prompt below.
spec_json = json.dumps(openapi_spec, indent=2)

prompt = f"""Generate pytest tests for this API endpoint scenario:

1. Valid user ID → 200 response with user object
2. Non-existent user → 404 response
3. Missing auth header → 401 response
4. Malformed ID → 400 response
5. Boundary: ID = 0, ID = 999999999

API Spec:\n{spec_json}

Generate complete pytest tests using requests mock or responses library."""

chat_messages = [{"role": "user", "content": prompt}]
response = client.chat.completions.create(model="gpt-4o", messages=chat_messages)

print(response.choices[0].message.content)

Generated integration tests depend on mock setup. LLM may not mock external services correctly: always validate mocks match real API behavior.

05 Validate Tests with Mutation Testing

Verify generated tests catch bugs using mutation operators

python

import subprocess
import json
import os
from openai import OpenAI

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def validate_with_mutations(test_file: str, code_file: str):
    """Score the test suite with mutmut and ask the LLM for more tests if weak.

    Runs ``mutmut`` against ``code_file`` using the ``tests`` directory,
    derives a mutation score from the ``killed`` and ``total`` fields of the
    parsed output, and returns either a success message or LLM-generated
    follow-up tests when the score is below 80%.

    NOTE(review): this parses mutmut's stdout as JSON; confirm that the
    installed mutmut release actually emits JSON with these fields.
    NOTE(review): ``test_file`` is accepted but not used by the body.
    """
    # Install: pip install mutmut
    command = ["mutmut", "run", "--tests-dir", "tests", "--paths-to-mutate", code_file]
    completed = subprocess.run(command, capture_output=True, text=True)

    report = json.loads(completed.stdout)
    total_mutants = report.get("total", 0)
    killed = report.get("killed", 0)

    # No mutants at all counts as a score of zero rather than a division error.
    if total_mutants > 0:
        mutation_score = killed / total_mutants * 100
    else:
        mutation_score = 0

    if mutation_score < 80:
        # Hand the raw mutmut output to the LLM and ask for stronger tests
        prompt = f"""Your tests have {mutation_score}% mutation score (should be >80%).
These mutations survived:\n{completed.stdout}

Generate additional tests to kill more mutants."""

        chat_messages = [{"role": "user", "content": prompt}]
        response = client.chat.completions.create(model="gpt-4o", messages=chat_messages)
        return response.choices[0].message.content

    return f"Mutation score: {mutation_score}% ✓"

# Usage: score the tests in tests/ against src/module.py and show the outcome
mutation_report = validate_with_mutations("tests/test_module.py", "src/module.py")
print(mutation_report)

High coverage ≠ high mutation score. Generated tests often fail mutation testing because they don't assert specific values.

Test Generation Comparison

Approach	Speed	Coverage	Maintainability	Best For

Common Errors & Fixes

01 Generated tests pass but don't catch real bugs

Cause: LLM generates tests that exercise code paths but use weak assertions (only checking return type, not value). Tests never fail unless there's a syntax error.

Fix:

python

Validate tests against mutation testing. Inject intentional bugs into source code and verify tests fail.

import subprocess
import tempfile
import os

def validate_tests_catch_bugs(test_file, code_file, code_with_bug):
    """Check that the test suite fails when the code under test is broken.

    Temporarily replaces the contents of ``code_file`` with ``code_with_bug``,
    runs ``pytest`` on ``test_file``, and always restores the original source
    afterwards, even if pytest cannot be started.

    Args:
        test_file: Path of the pytest file to run.
        code_file: Path of the module that the tests import.
        code_with_bug: Complete replacement source containing a deliberate bug.

    Returns:
        A confirmation string with the first 200 characters of pytest's output.

    Raises:
        AssertionError: If the tests still pass against the buggy source.
    """
    # Back up as bytes so restoring never alters line endings or encoding.
    with open(code_file, 'rb') as src:
        original_source = src.read()

    # Give the child interpreter an empty bytecode cache; otherwise it could
    # reuse a .pyc compiled from the original source when the buggy version
    # has the same size and is written within the same second.
    with tempfile.TemporaryDirectory() as pycache_dir:
        env = dict(os.environ, PYTHONPYCACHEPREFIX=pycache_dir)
        try:
            # The tests import code_file, so the bug has to live there for
            # the run to exercise it.
            with open(code_file, 'w', encoding='utf-8') as src:
                src.write(code_with_bug)

            result = subprocess.run(
                ['pytest', test_file, '--tb=short'],
                capture_output=True,
                text=True,
                env=env
            )
        finally:
            with open(code_file, 'wb') as src:
                src.write(original_source)

    if result.returncode == 0:
        raise AssertionError("Tests passed even with intentional bug! Tests are too weak.")
    return f"✓ Tests correctly caught bug: {result.stdout[:200]}"

02 LLM imports non-existent or outdated test libraries

Cause: Training data includes old pytest plugins, removed libraries, or hallucinated package names. Generated tests fail immediately on import.

Fix:

python

Pre-validate imports by parsing generated code before execution.

import ast
import importlib

def validate_imports(test_code: str) -> list:
    """Verify that every module imported by ``test_code`` can be imported.

    Covers both ``import x`` and ``from x import y`` statements. Relative
    imports (``from . import y``) are skipped because they cannot be resolved
    without knowing the package the generated file will live in.

    Args:
        test_code: Python source of the generated tests.

    Returns:
        An empty list when every import resolves.

    Raises:
        SyntaxError: If ``test_code`` is not valid Python.
        ImportError: If any imported module is missing; the message lists
            each missing module once, in order of first appearance.
    """
    tree = ast.parse(test_code)

    module_names = []
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            module_names.extend(alias.name for alias in node.names)
        elif isinstance(node, ast.ImportFrom) and node.level == 0 and node.module:
            # Only the module has to exist here; a wrong member name surfaces
            # when the tests themselves are collected.
            module_names.append(node.module)

    missing = []
    # dict.fromkeys de-duplicates while preserving first-seen order
    for name in dict.fromkeys(module_names):
        try:
            importlib.import_module(name)
        except ImportError:
            missing.append(name)

    if missing:
        raise ImportError(f"Generated test imports non-existent packages: {missing}")
    return missing

# Usage before executing tests
# NOTE(review): `generated_test_code` is not defined in this snippet;
# presumably it holds the LLM's response text. Confirm before running.
validate_imports(generated_test_code)

03 Mock setup doesn't match real system behavior

Cause: LLM mocks external services incorrectly (wrong return types, missing side effects) so tests pass in isolation but fail in integration.

Fix:

python

Use recorded VCR cassettes and contract testing instead of LLM-generated mocks.

# Install: pip install vcrpy
import vcr

# record_mode='once' is passed through to every cassette created by my_vcr
my_vcr = vcr.VCR(record_mode='once')

@my_vcr.use_cassette('api_responses.yaml')
def test_with_real_recorded_response():
    """Uses recorded HTTP responses, not LLM-generated mocks."""
    # Imported here because this snippet has no module-level requests import
    import requests

    response = requests.get('https://api.example.com/users')
    assert response.status_code == 200

# For contract testing:
# Use Pact to verify mock matches real API contract
import json
from pact import Consumer, Provider

# A pact is declared between a consumer and a named provider; interactions
# are then described with given / upon_receiving / with_request /
# will_respond_with.
# NOTE(review): the provider name, request path, and response body below are
# illustrative placeholders; replace them with your service's real contract.
pact = Consumer('TestClient').has_pact_with(Provider('UserService'))

(pact
 .given('user exists')
 .upon_receiving('a request for user 123')
 .with_request('get', '/users/123')
 .will_respond_with(200, body={'id': 123}))

04 Coverage metrics mislead: 100% coverage, 0% mutation score

Cause: Generated tests exercise all lines but use no assertions. Code runs without error = test passes, but doesn't verify correctness.

Fix:

python

Always pair coverage with mutation testing. Require both metrics.

# Install: pip install mutmut pytest-cov
import subprocess

def enforce_quality_gates(threshold_coverage=90, threshold_mutation=80):
    """Fail CI if line coverage or mutation score is below its threshold.

    Args:
        threshold_coverage: Minimum acceptable ``percent_covered`` value.
        threshold_mutation: Minimum acceptable percentage of killed mutants.

    Raises:
        AssertionError: If either metric is below its threshold. A run that
            reports no mutants at all scores 0 and therefore fails the gate.
    """
    # Local import: this snippet's import block only brings in subprocess
    import json

    # Line coverage: --cov-report=json writes coverage.json to the working dir
    subprocess.run(
        ['pytest', '--cov=src', '--cov-report=json'],
        capture_output=True
    )
    with open('coverage.json') as report:
        coverage = json.load(report)
    percent_covered = coverage['totals']['percent_covered']
    if percent_covered < threshold_coverage:
        raise AssertionError(f"Coverage {percent_covered}% < {threshold_coverage}%")

    # Mutation score
    # NOTE(review): confirm the installed mutmut release supports `run --json`
    # and prints a list of objects with a 'status' field.
    mut_result = subprocess.run(['mutmut', 'run', '--json'], capture_output=True)
    mutation_data = json.loads(mut_result.stdout)
    killed = sum(1 for m in mutation_data if m.get('status') == 'killed')
    # Guard the division: an empty report would otherwise raise ZeroDivisionError
    score = (killed / len(mutation_data)) * 100 if mutation_data else 0
    if score < threshold_mutation:
        raise AssertionError(f"Mutation score {score}% < {threshold_mutation}%")

    print(f"✓ Coverage: {percent_covered}% | Mutation: {score}%")

Production Gotchas

⚠ LLM hallucinates test scenarios that don't apply to your code

Claude or GPT-4o may generate tests for features your code doesn't have (e.g., async when function is sync, or Redis caching when you use in-memory). Always review generated tests against actual requirements. Use structured prompts with explicit constraints: 'Do NOT generate tests for async: this is synchronous code.'

⚠ Generated mocks hide integration bugs

A test passes against a perfect mock but fails against real external service (wrong field names, different error codes, timeout behavior). Use contract testing (Pact, Spring Cloud Contract) instead of LLM-generated mocks. Record real interactions once, replay them in tests.

⚠ Test generation explodes with large codebases

Asking GPT-4o to generate tests for a 10,000-line module hits token limits and produces incomplete tests. Break code into smaller units (functions ~20 lines each) or use tool-specific test generators (pytest-asyncio, hypothesis discovery) instead of monolithic LLM calls.

⚠ Edge case assertions are often missing

LLM generates 'happy path' tests but skips boundary conditions (INT_MAX, empty strings, None values). Explicitly ask for: 'Include tests for: empty input, None, zero, negative values, max int.'

⚠ Mutation testing becomes the bottleneck

mutmut re-runs your test suite for every mutant, often thousands of times: if your test suite already takes 5 minutes, mutation testing takes hours. Limit the scope (e.g. point mutmut's `--paths-to-mutate` option at only the changed files) or split into fast mutation checks (arithmetic operators) vs slow checks (all operators).

⚠ Dependency versions in generated imports drift

LLM generates `import pytest; from mock import patch`, but `mock` is a separate third-party backport that may not be installed in your CI, while `unittest.mock` is the standard-library equivalent. Always validate that generated imports work in your target Python version and environment.

End-to-End Test Generation Pipeline: Input Code → LLM → Generated Tests → Validation → Coverage Report

python

import os
import json
import subprocess
import tempfile
from openai import OpenAI

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def _strip_markdown_fence(text):
    """Return *text* without a leading Markdown code fence, if it has one."""
    stripped = text.strip()
    if not stripped.startswith('```'):
        return text
    # Everything between the first pair of fences is the code body
    body = stripped.split('```')[1]
    if body.startswith('python'):
        body = body[len('python'):]
    return body


def generate_and_validate_tests(source_code: str, module_name: str):
    """Generate tests with the LLM, then check syntax, coverage and mutations.

    Args:
        source_code: Python source of the module under test.
        module_name: Module name without ``.py``; used for the temporary
            source file, the test file name, and the ``--cov`` target.

    Returns:
        ``None`` if the generated tests have a syntax error or pytest exits
        non-zero. Otherwise a dict with keys ``test_code`` (str),
        ``coverage`` and ``mutation_score``; each metric is a number, or the
        string ``"N/A"`` when its report could not be read.
    """
    print("[1] Generating tests with LLM...")

    prompt = f"""Generate comprehensive pytest unit tests for this Python code.
Include:
- Happy path
- Edge cases (zero, None, empty, max values)
- Error conditions
- Type validation

Output ONLY valid, runnable pytest code. No explanations.

```python
{source_code}
```"""

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7
    )

    # content can be None (e.g. on a refusal); treat that as empty output
    test_code = _strip_markdown_fence(response.choices[0].message.content or "")

    print("[2] Validating test syntax...")

    try:
        compile(test_code, "<tests>", "exec")
    except SyntaxError as e:
        print(f"✗ Syntax error in generated tests: {e}")
        return None
    print("✓ Syntax valid")

    print("[3] Writing to temp files and running tests...")

    with tempfile.TemporaryDirectory() as tmpdir:
        # Write source code
        src_path = os.path.join(tmpdir, f"{module_name}.py")
        with open(src_path, 'w') as f:
            f.write(source_code)

        # Write test code
        test_path = os.path.join(tmpdir, f"test_{module_name}.py")
        with open(test_path, 'w') as f:
            f.write(test_code)

        # Run pytest with coverage
        print("[4] Running tests with coverage...")
        result = subprocess.run(
            ["pytest", test_path, "--cov=" + module_name,
             "--cov-report=json", "-v"],
            capture_output=True,
            text=True,
            cwd=tmpdir
        )

        print(result.stdout)
        if result.returncode != 0:
            print(f"✗ Test execution failed: {result.stderr}")
            return None

        # Parse coverage: the JSON report is written as coverage.json in cwd
        coverage_pct = "N/A"
        try:
            with open(os.path.join(tmpdir, "coverage.json")) as f:
                coverage_pct = json.load(f)["totals"]["percent_covered"]
        except (OSError, ValueError, KeyError, TypeError) as err:
            print(f"[5] Coverage report unavailable: {err}")
        else:
            print(f"[5] Code Coverage: {coverage_pct}%")

        # Run mutation testing against the source file written above
        print("[6] Running mutation testing...")
        mut_result = subprocess.run(
            ["mutmut", "run", "--paths-to-mutate", f"{module_name}.py"],
            capture_output=True,
            text=True,
            cwd=tmpdir
        )

        # Default first so the summary below never hits an unbound name.
        # NOTE(review): this expects JSON with 'killed'/'total' on stdout;
        # confirm the installed mutmut release emits that.
        mutation_score = "N/A"
        try:
            mut_data = json.loads(mut_result.stdout)
            killed = mut_data.get("killed", 0)
            total = mut_data.get("total", 1)
            mutation_score = (killed / total * 100) if total > 0 else 0
        except (ValueError, AttributeError, TypeError):
            print("[7] Mutation testing failed to parse")
        else:
            print(f"[7] Mutation Score: {mutation_score:.1f}% ({killed}/{total} killed)")

        # Only append '%' and numeric formatting when a real number exists
        coverage_text = "N/A" if isinstance(coverage_pct, str) else f"{coverage_pct}%"
        mutation_text = "N/A" if isinstance(mutation_score, str) else f"{mutation_score:.1f}%"

        print(f"\n{'='*60}")
        print(f"Generated tests ({len(test_code)} chars)")
        print(f"Coverage: {coverage_text} | Mutation: {mutation_text}")
        print(f"{'='*60}")

        return {
            "test_code": test_code,
            "coverage": coverage_pct,
            "mutation_score": mutation_score
        }

# Example: run the whole pipeline on a small email validator
sample_code = '''
def validate_email(email: str) -> bool:
    """Return True if email format is valid, False otherwise."""
    if not isinstance(email, str):
        raise TypeError("Email must be string")
    if not email or '@' not in email:
        return False
    local, domain = email.rsplit('@', 1)
    if not local or not domain or '.' not in domain:
        return False
    return True
'''

result = generate_and_validate_tests(sample_code, "email_validator")
if result:
    # Show only the first 500 characters of the generated test module
    preview = result["test_code"][:500]
    print("\nGenerated test file ready for production:")
    print(preview + "...")

Verified 2026-04 · gpt-4o, gpt-4o-mini, claude-3-5-sonnet-20241022

Verify ↗

Community Notes

No notes yet. Be the first to share a version-specific fix or tip.