How to summarize PDF with Python
Direct answer
Use Python libraries like PyPDF2 or pdfplumber to extract text from PDFs, then send the extracted text to an AI model like gpt-4o via the OpenAI Python SDK to generate a summary.
Setup
Install
pip install openai PyPDF2

Env vars
OPENAI_API_KEY

Imports
import os
from openai import OpenAI
import PyPDF2

Examples
in: A PDF with a 2-page article about climate change.
out: Summary: The article discusses the causes and effects of climate change, emphasizing the role of greenhouse gases and the need for global action.
in: A PDF containing a product manual of 10 pages.
out: Summary: This manual provides step-by-step instructions for setting up and operating the product safely and efficiently.
in: A scanned PDF with mostly images and little text.
out: Summary: The PDF contains mostly images; text extraction was limited, so a detailed summary is not available.
Integration steps
- Install PyPDF2 and openai Python packages.
- Extract text from the PDF file using PyPDF2.
- Initialize the OpenAI client with OPENAI_API_KEY from environment variables.
- Send the extracted text as a prompt to the gpt-4o model for summarization.
- Parse and display the summary from the API response.
Full code
import os
from openai import OpenAI
import PyPDF2

# Create the OpenAI client once; the API key is read from the environment.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Pages with no extractable text (e.g. scanned images) are skipped.
    """
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [page.extract_text() for page in reader.pages]
    return "\n".join(content for content in page_texts if content)


def summarize_text(text: str) -> str:
    """Ask gpt-4o for a summary of *text* and return it with surrounding whitespace stripped."""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": f"Summarize the following text:\n\n{text}"}
        ],
        max_tokens=300,
    )
    return response.choices[0].message.content.strip()


if __name__ == "__main__":
    pdf_path = "sample.pdf"  # Replace with your PDF file path
    document_text = extract_text_from_pdf(pdf_path)
    if not document_text.strip():
        print("No extractable text found in the PDF.")
    else:
        summary = summarize_text(document_text)
        print("Summary:\n", summary)

output
Summary: The article discusses the causes and effects of climate change, emphasizing the role of greenhouse gases and the need for global action.
API trace
Request
{"model": "gpt-4o", "messages": [{"role": "user", "content": "Summarize the following text:\n\n<extracted PDF text>"}], "max_tokens": 300}

Response
{"choices": [{"message": {"content": "<summary text>"}}], "usage": {"prompt_tokens": 500, "completion_tokens": 150, "total_tokens": 650}}

Extract
response.choices[0].message.content

Variants
Streaming summary output ›
Use streaming to display the summary progressively for large PDF texts or better user experience.
import os
from openai import OpenAI
import PyPDF2
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def extract_text_from_pdf(pdf_path: str) -> str:
text = []
with open(pdf_path, "rb") as f:
reader = PyPDF2.PdfReader(f)
for page in reader.pages:
page_text = page.extract_text()
if page_text:
text.append(page_text)
return "\n".join(text)
def summarize_text_stream(text: str):
prompt = f"Summarize the following text:\n\n{text}"
stream = client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": prompt}],
max_tokens=300,
stream=True
)
for chunk in stream:
delta = chunk.choices[0].delta.content or ""
print(delta, end="", flush=True)
if __name__ == "__main__":
pdf_file = "sample.pdf"
extracted_text = extract_text_from_pdf(pdf_file)
if not extracted_text.strip():
print("No extractable text found in the PDF.")
else:
summarize_text_stream(extracted_text) Async PDF summarization ›
Use async version for integration in async web servers or concurrent workflows.
import os
import asyncio
from openai import OpenAI
import PyPDF2
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def extract_text_from_pdf(pdf_path: str) -> str:
text = []
with open(pdf_path, "rb") as f:
reader = PyPDF2.PdfReader(f)
for page in reader.pages:
page_text = page.extract_text()
if page_text:
text.append(page_text)
return "\n".join(text)
async def summarize_text_async(text: str) -> str:
prompt = f"Summarize the following text:\n\n{text}"
response = await client.chat.completions.acreate(
model="gpt-4o",
messages=[{"role": "user", "content": prompt}],
max_tokens=300
)
return response.choices[0].message.content.strip()
async def main():
pdf_file = "sample.pdf"
extracted_text = extract_text_from_pdf(pdf_file)
if not extracted_text.strip():
print("No extractable text found in the PDF.")
return
summary = await summarize_text_async(extracted_text)
print("Summary:\n", summary)
if __name__ == "__main__":
asyncio.run(main()) Use Anthropic Claude for summarization ›
Use Anthropic Claude models if you prefer their summarization style or have an Anthropic API key.
import os
import anthropic

client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Pages with no extractable text (e.g. scanned images) are skipped.
    """
    import PyPDF2
    text = []
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text.append(page_text)
    return "\n".join(text)


def summarize_text_claude(text: str) -> str:
    """Summarize *text* with Claude and return the stripped summary string."""
    system_prompt = "You are a helpful assistant that summarizes text concisely."
    user_prompt = f"Summarize the following text:\n\n{text}"
    response = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=300,
        system=system_prompt,
        messages=[{"role": "user", "content": user_prompt}]
    )
    # BUG FIX: the Messages API returns `response.content` as a LIST of
    # content blocks, not a string; calling `.strip()` on the list raises
    # AttributeError. Read the text of the first block instead.
    return response.content[0].text.strip()


if __name__ == "__main__":
    pdf_file = "sample.pdf"
    extracted_text = extract_text_from_pdf(pdf_file)
    if not extracted_text.strip():
        print("No extractable text found in the PDF.")
    else:
        summary = summarize_text_claude(extracted_text)
        print("Summary:\n", summary)

Performance
Latency: ~1.2s for gpt-4o summarization of 1000 tokens
Cost: ~$0.0025 per 500 tokens for gpt-4o
Rate limits: Tier 1: 350 RPM / 20K TPM
- Extract and summarize only key sections to reduce tokens.
- Use max_tokens parameter to limit summary length.
- Preprocess text to remove irrelevant whitespace or headers.
| Approach | Latency | Cost/call | Best for |
|---|---|---|---|
| Standard OpenAI gpt-4o | ~1.2s | ~$0.0025 | General PDF summarization |
| Streaming output | Similar latency, better UX | ~$0.0025 | Long documents with progressive display |
| Async calls | ~1.2s concurrent | ~$0.0025 | Web servers and async workflows |
| Anthropic Claude | ~1.5s | Check Anthropic pricing | Alternative summarization style |
Quick tip
Extract only the relevant text from PDFs before summarization to reduce token usage and improve summary quality.
Common mistake
Sending raw PDF binary or unextracted text directly to the model instead of extracting and cleaning text first.