How to summarize PDF with Python
Direct answer
Use Python libraries like PyPDF2 or pdfplumber to extract text from PDFs, then send the extracted text to an AI model like gpt-4o via the OpenAI Python SDK to generate a summary.
Setup
Install
pip install openai PyPDF2

Env vars
OPENAI_API_KEY

Imports
import os
from openai import OpenAI
import PyPDF2

Examples
in: A PDF with a 2-page article about climate change.
out: Summary: The article discusses the causes and effects of climate change, emphasizing the role of greenhouse gases and the need for global action.
in: A PDF containing a product manual of 10 pages.
out: Summary: This manual provides step-by-step instructions for setting up and operating the product safely and efficiently.
in: A scanned PDF with mostly images and little text.
out: Summary: The PDF contains mostly images; text extraction was limited, so a detailed summary is not available.
Integration steps
- Install PyPDF2 and openai Python packages.
- Extract text from the PDF file using PyPDF2.
- Initialize the OpenAI client with OPENAI_API_KEY from environment variables.
- Send the extracted text as a prompt to the gpt-4o model for summarization.
- Parse and display the summary from the API response.
Full code
import os
from openai import OpenAI
import PyPDF2

# Create the OpenAI client once; the API key is read from the environment.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Pages with no extractable text (e.g. scanned images) are skipped.
    """
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [page.extract_text() for page in reader.pages]
    return "\n".join(content for content in page_texts if content)


def summarize_text(text: str) -> str:
    """Ask gpt-4o for a summary of *text* and return it with surrounding whitespace stripped."""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": f"Summarize the following text:\n\n{text}"}
        ],
        max_tokens=300,
    )
    return response.choices[0].message.content.strip()


if __name__ == "__main__":
    pdf_path = "sample.pdf"  # Replace with your PDF file path
    document_text = extract_text_from_pdf(pdf_path)
    if not document_text.strip():
        print("No extractable text found in the PDF.")
    else:
        summary = summarize_text(document_text)
        print("Summary:\n", summary)

output
Summary: The article discusses the causes and effects of climate change, emphasizing the role of greenhouse gases and the need for global action.
API trace
Request
{"model": "gpt-4o", "messages": [{"role": "user", "content": "Summarize the following text:\n\n<extracted PDF text>"}], "max_tokens": 300}

Response
{"choices": [{"message": {"content": "<summary text>"}}], "usage": {"prompt_tokens": 500, "completion_tokens": 150, "total_tokens": 650}}

Extract
response.choices[0].message.content

Variants
Streaming summary output ›
Use streaming to display the summary progressively for large PDF texts or better user experience.
import os
from openai import OpenAI
import PyPDF2
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def extract_text_from_pdf(pdf_path: str) -> str:
text = []
with open(pdf_path, "rb") as f:
reader = PyPDF2.PdfReader(f)
for page in reader.pages:
page_text = page.extract_text()
if page_text:
text.append(page_text)
return "\n".join(text)
def summarize_text_stream(text: str):
prompt = f"Summarize the following text:\n\n{text}"
stream = client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": prompt}],
max_tokens=300,
stream=True
)
for chunk in stream:
delta = chunk.choices[0].delta.content or ""
print(delta, end="", flush=True)
if __name__ == "__main__":
pdf_file = "sample.pdf"
extracted_text = extract_text_from_pdf(pdf_file)
if not extracted_text.strip():
print("No extractable text found in the PDF.")
else:
summarize_text_stream(extracted_text) Async PDF summarization ›
Use async version for integration in async web servers or concurrent workflows.
import os
import asyncio
from openai import OpenAI
import PyPDF2
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def extract_text_from_pdf(pdf_path: str) -> str:
text = []
with open(pdf_path, "rb") as f:
reader = PyPDF2.PdfReader(f)
for page in reader.pages:
page_text = page.extract_text()
if page_text:
text.append(page_text)
return "\n".join(text)
async def summarize_text_async(text: str) -> str:
prompt = f"Summarize the following text:\n\n{text}"
response = await client.chat.completions.acreate(
model="gpt-4o",
messages=[{"role": "user", "content": prompt}],
max_tokens=300
)
return response.choices[0].message.content.strip()
async def main():
pdf_file = "sample.pdf"
extracted_text = extract_text_from_pdf(pdf_file)
if not extracted_text.strip():
print("No extractable text found in the PDF.")
return
summary = await summarize_text_async(extracted_text)
print("Summary:\n", summary)
if __name__ == "__main__":
asyncio.run(main()) Use Anthropic Claude for summarization ›
Use Anthropic Claude models if you prefer their summarization style or have an Anthropic API key.
import os
import anthropic

client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Pages with no extractable text (e.g. scanned images) are skipped.
    """
    import PyPDF2
    text = []
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text.append(page_text)
    return "\n".join(text)


def summarize_text_claude(text: str) -> str:
    """Summarize *text* with Claude and return the stripped summary string."""
    system_prompt = "You are a helpful assistant that summarizes text concisely."
    user_prompt = f"Summarize the following text:\n\n{text}"
    response = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=300,
        system=system_prompt,
        messages=[{"role": "user", "content": user_prompt}]
    )
    # BUG FIX: the Messages API returns `response.content` as a LIST of
    # content blocks, not a string; calling `.strip()` on the list raises
    # AttributeError. Read the text of the first block instead.
    return response.content[0].text.strip()


if __name__ == "__main__":
    pdf_file = "sample.pdf"
    extracted_text = extract_text_from_pdf(pdf_file)
    if not extracted_text.strip():
        print("No extractable text found in the PDF.")
    else:
        summary = summarize_text_claude(extracted_text)
        print("Summary:\n", summary)

Performance
Latency: ~1.2s for gpt-4o summarization of 1000 tokens
Cost: ~$0.0025 per 500 tokens for gpt-4o
Rate limits: Tier 1: 350 RPM / 20K TPM
- Extract and summarize only key sections to reduce tokens.
- Use max_tokens parameter to limit summary length.
- Preprocess text to remove irrelevant whitespace or headers.
| Approach | Latency | Cost/call | Best for |
|---|---|---|---|
| Standard OpenAI gpt-4o | ~1.2s | ~$0.0025 | General PDF summarization |
| Streaming output | Similar latency, better UX | ~$0.0025 | Long documents with progressive display |
| Async calls | ~1.2s concurrent | ~$0.0025 | Web servers and async workflows |
| Anthropic Claude | ~1.5s | Check Anthropic pricing | Alternative summarization style |
Quick tip
Extract only the relevant text from PDFs before summarization to reduce token usage and improve summary quality.
Common mistake
Sending raw PDF binary or unextracted text directly to the model instead of extracting and cleaning text first.