Downloading PDFs with Python using Requests and BeautifulSoup
Requests and BeautifulSoup are Python libraries that, used together, make it easy to download PDFs from web pages. The requests library sends HTTP requests and receives responses, while BeautifulSoup parses HTML to extract PDF links. In this article, we will understand how to download PDFs using these libraries in Python.
Installing Dependencies
Before using the BeautifulSoup and Requests libraries, we need to install them using pip:

pip install requests
pip install beautifulsoup4
Method 1: Direct PDF Download
If you have a direct PDF URL, you can download it directly using requests:

import requests

def download_pdf_direct(url, filename):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raises an HTTPError for bad responses
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f'PDF downloaded successfully as {filename}')
    except requests.exceptions.RequestException as e:
        print(f'Error downloading PDF: {e}')

# Example usage
pdf_url = 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf'
download_pdf_direct(pdf_url, 'sample.pdf')
PDF downloaded successfully as sample.pdf
Method 2: Extracting PDF Links from Web Pages
When PDF links are embedded in HTML pages, use BeautifulSoup to extract them:

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def find_and_download_pdfs(base_url, download_folder='.'):
    try:
        # Fetch the webpage
        response = requests.get(base_url)
        response.raise_for_status()
        # Parse HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find all links ending with .pdf
        pdf_links = soup.find_all('a', href=True)
        pdf_urls = []
        for link in pdf_links:
            href = link['href']
            if href.endswith('.pdf'):
                # Convert relative URLs to absolute URLs
                full_url = urljoin(base_url, href)
                pdf_urls.append(full_url)
        print(f'Found {len(pdf_urls)} PDF links')
        # Download each PDF into the target folder
        for i, pdf_url in enumerate(pdf_urls):
            filename = os.path.join(download_folder, f'document_{i+1}.pdf')
            download_pdf_direct(pdf_url, filename)
    except Exception as e:
        print(f'Error: {e}')

# Example: This would work with a real webpage containing PDF links
# find_and_download_pdfs('https://example.com/resources')
print('Function defined successfully. Use with a real webpage containing PDF links.')
Function defined successfully. Use with a real webpage containing PDF links.
Method 3: Handling Different PDF Link Patterns
Some websites use different patterns for PDF links. Here's a more robust approach:

import requests
from bs4 import BeautifulSoup

def advanced_pdf_extractor(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Different ways to find PDF links
        pdf_links = []
        # Method 1: Links with .pdf extension
        for link in soup.find_all('a', href=True):
            if '.pdf' in link['href'].lower():
                pdf_links.append(link['href'])
        # Method 2: Links with 'pdf' in text content
        for link in soup.find_all('a', string=lambda text: text and 'pdf' in text.lower()):
            if link.get('href'):
                pdf_links.append(link['href'])
        # Method 3: Input elements with PDF file types
        for input_elem in soup.find_all('input', {'type': 'file', 'accept': lambda x: x and 'pdf' in x}):
            if input_elem.get('value'):
                pdf_links.append(input_elem['value'])
        # Remove duplicates and return
        return list(set(pdf_links))
    except Exception as e:
        print(f'Error extracting PDF links: {e}')
        return []

# Example function call
links = advanced_pdf_extractor('https://httpbin.org/html')
print(f'Found {len(links)} potential PDF links')
Found 0 potential PDF links
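Note that advanced_pdf_extractor returns the hrefs exactly as they appear in the page, so some of them may be relative. Before passing them to a downloader such as download_pdf_direct from Method 1, resolve each one against the page URL with urljoin. A small sketch (the page URL and link list here are made up for illustration):

```python
from urllib.parse import urljoin

# Hypothetical page and a mix of relative and absolute extracted links
page_url = 'https://example.com/resources/index.html'
links = ['/files/a.pdf', 'b.pdf', 'https://example.com/c.pdf']

# Resolve every href against the page it was found on
resolved = [urljoin(page_url, href) for href in links]
print(resolved)
```

Root-relative links ('/files/a.pdf') resolve against the site root, bare names ('b.pdf') against the page's directory, and already-absolute URLs pass through unchanged.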
Best Practices
When downloading PDFs programmatically, follow these guidelines:
- Check file size: large files may require streaming downloads
- Handle errors gracefully: use try-except blocks for network issues
- Respect robots.txt: check website policies before scraping
- Add delays: avoid overwhelming servers with rapid requests
- Verify content type: ensure the response is actually a PDF
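The robots.txt guideline can be automated with the standard library's urllib.robotparser. The sketch below is illustrative (the function name and the inline rules are invented for this example, not part of the article): it checks whether a path may be fetched, and accepts the robots.txt text directly so the logic can be tried without a network request.

```python
from urllib import robotparser
from urllib.parse import urljoin

def is_allowed(base_url, path, user_agent='*', robots_txt=None):
    """Return True if robots.txt permits fetching `path` on this site.

    Pass `robots_txt` text to parse rules directly (useful offline);
    otherwise the site's live /robots.txt is fetched.
    """
    rp = robotparser.RobotFileParser()
    if robots_txt is not None:
        rp.parse(robots_txt.splitlines())
    else:
        rp.set_url(urljoin(base_url, '/robots.txt'))
        rp.read()
    return rp.can_fetch(user_agent, urljoin(base_url, path))

# Hypothetical rules: everything under /private/ is off limits
rules = 'User-agent: *\nDisallow: /private/'
print(is_allowed('https://example.com', '/docs/file.pdf', robots_txt=rules))
print(is_allowed('https://example.com', '/private/file.pdf', robots_txt=rules))
```

For the delay guideline, a simple time.sleep(1) between consecutive downloads is usually enough to avoid hammering a server.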
Error Handling Example
import requests
from urllib.parse import urlparse

def safe_pdf_download(url, filename=None):
    try:
        # Send HEAD request first to check content type
        head_response = requests.head(url)
        content_type = head_response.headers.get('content-type', '')
        if 'pdf' not in content_type.lower():
            print(f'Warning: Content type is {content_type}, not PDF')
        # Download the file
        response = requests.get(url, stream=True)
        response.raise_for_status()
        # Generate filename if not provided
        if not filename:
            parsed_url = urlparse(url)
            filename = parsed_url.path.split('/')[-1] or 'downloaded.pdf'
            if not filename.endswith('.pdf'):
                filename += '.pdf'
        # Stream the body to disk in chunks to keep memory use low
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f'Successfully downloaded: {filename}')
        return True
    except requests.exceptions.RequestException as e:
        print(f'Network error: {e}')
        return False
    except Exception as e:
        print(f'Unexpected error: {e}')
        return False

# Test the function
url = 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf'
safe_pdf_download(url, 'test_download.pdf')
Successfully downloaded: test_download.pdf
Conclusion
Python's requests and BeautifulSoup libraries provide powerful tools for downloading PDFs from the web. Use direct downloads for known URLs, and combine both libraries to extract and download PDFs from HTML pages with proper error handling.
