Article Categories
- All Categories
-
Data Structure
-
Networking
-
RDBMS
-
Operating System
-
Java
-
MS Excel
-
iOS
-
HTML
-
CSS
-
Android
-
Python
-
C Programming
-
C++
-
C#
-
MongoDB
-
MySQL
-
Javascript
-
PHP
-
Economics & Finance
How to save HTML Tables data to CSV in Python
Extracting data from HTML tables and saving it to CSV format is a common task in data science and web scraping. When dealing with multiple tables on a webpage, manual copy-paste becomes impractical, making automated extraction essential.
This tutorial demonstrates how to use Python libraries like csv, urllib, and BeautifulSoup to scrape HTML table data and convert it to CSV format efficiently.
Required Libraries
Before starting, ensure you have the necessary libraries installed −
pip install beautifulsoup4
The csv and urllib modules are part of Python's standard library.
Creating a Simple CSV File
Let's start with the basics of creating a CSV file using Python's built-in csv module −
import csv

# Open the file in write mode; it is created if it does not exist.
# newline='' is required by the csv module to avoid blank rows on Windows,
# and the with-statement guarantees the file is closed even on error.
with open('test.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    # Header row
    writer.writerow(('Column1', 'Column2', 'Column3'))
    # Five data rows of the form (i, i+1, i+2)
    for i in range(5):
        writer.writerow((i, i + 1, i + 2))

print("CSV file 'test.csv' created successfully!")
The output of the above code creates a test.csv file in the same directory −
CSV file 'test.csv' created successfully!
Contents of test.csv:
Column1,Column2,Column3
0,1,2
1,2,3
2,3,4
3,4,5
4,5,6
Scraping HTML Table Data
Now let's extract HTML table data from a webpage and save it to CSV format. We'll use BeautifulSoup to parse HTML and extract table content.
Step 1: Import Required Modules
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
Step 2: Fetch and Parse HTML
# Download the page, then hand the response to BeautifulSoup for parsing
url = 'https://www.tutorialspoint.com/python/python_dictionary.htm'
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')
Step 3: Extract Table Data
# Collect every <table> element on the page, keep the first one,
# and pull out its rows (<tr> elements) for processing.
all_tables = soup.find_all('table')
table = all_tables[0]
rows = table.find_all('tr')
Step 4: Process and Save Data
# Write every table row to the CSV file. The with-statement closes the
# file on exit (including on error), replacing an explicit try/finally.
with open('html_table_data.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for table_row in rows:
        # One cleaned cell per <td>/<th>, stripped of surrounding whitespace
        cleaned_cells = [cell.get_text().strip()
                         for cell in table_row.find_all(['td', 'th'])]
        writer.writerow(cleaned_cells)
Complete Example
Here's the complete script that extracts HTML table data and saves it to a CSV file −
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup


def scrape_table_to_csv(url, output_filename, table_index=0):
    """
    Scrape HTML table data and save it to a CSV file.

    Args:
        url: URL of the webpage containing the table.
        output_filename: Name of the output CSV file.
        table_index: Index of the table to extract (default: 0).
    """
    try:
        # Fetch and parse the page; the with-block guarantees the HTTP
        # response is closed instead of leaking the connection.
        with urlopen(url) as response:
            soup = BeautifulSoup(response, 'html.parser')

        # Locate the requested table, bailing out with a message if absent.
        tables = soup.find_all('table')
        if not tables:
            print("No tables found on the webpage")
            return
        if table_index >= len(tables):
            print(f"Table index {table_index} not found. Only {len(tables)} table(s) available.")
            return

        table = tables[table_index]
        rows = table.find_all('tr')

        # Write the content to the CSV file; utf-8 handles non-ASCII cells
        # and newline='' prevents blank rows on Windows.
        with open(output_filename, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            for row in rows:
                # Extract text content and collapse embedded newlines
                row_data = [cell.get_text().strip().replace('\n', ' ')
                            for cell in row.find_all(['td', 'th'])]
                # Only write non-empty rows
                if row_data:
                    writer.writerow(row_data)

        print(f"Table data successfully saved to '{output_filename}'")
        print(f"Total rows extracted: {len(rows)}")
    except Exception as e:
        # Tutorial-level catch-all: report the failure instead of crashing.
        print(f"Error occurred: {e}")


# Example usage
url = 'https://www.tutorialspoint.com/python/python_basic_syntax.htm'
scrape_table_to_csv(url, 'python_syntax_table.csv', 0)
The output shows successful extraction of table data −
Table data successfully saved to 'python_syntax_table.csv' Total rows extracted: 5
Extracting Multiple Tables
To extract all tables from a webpage, you can modify the script to loop through all tables −
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup


def scrape_all_tables_to_csv(url, base_filename):
    """
    Scrape all HTML tables from a webpage and save each to a separate CSV
    file named '{base_filename}_table_{N}.csv'.

    Args:
        url: URL of the webpage to scrape.
        base_filename: Prefix used for every generated CSV filename.
    """
    try:
        # Fetch and parse the page; close the response when done.
        with urlopen(url) as response:
            soup = BeautifulSoup(response, 'html.parser')

        tables = soup.find_all('table')
        if not tables:
            print("No tables found on the webpage")
            return

        for i, table in enumerate(tables):
            rows = table.find_all('tr')
            filename = f"{base_filename}_table_{i+1}.csv"
            with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
                writer = csv.writer(csv_file)
                for row in rows:
                    row_data = [cell.get_text().strip().replace('\n', ' ')
                                for cell in row.find_all(['td', 'th'])]
                    if row_data:
                        writer.writerow(row_data)
            # Bug fix: report the actual filename (was a broken literal).
            print(f"Table {i+1} saved to '{filename}' ({len(rows)} rows)")

        print(f"Total tables extracted: {len(tables)}")
    except Exception as e:
        print(f"Error occurred: {e}")


# Example usage
url = 'https://www.tutorialspoint.com/python/python_basic_operators.htm'
scrape_all_tables_to_csv(url, 'operators')
This script creates separate CSV files for each table found on the webpage −
Table 1 saved to 'operators_table_1.csv' (8 rows)
Table 2 saved to 'operators_table_2.csv' (6 rows)
Table 3 saved to 'operators_table_3.csv' (4 rows)
Total tables extracted: 3
Error Handling and Best Practices
When scraping web data, it's important to implement proper error handling and follow best practices −
import csv
import time
from urllib.request import urlopen
from urllib.error import URLError, HTTPError
from bs4 import BeautifulSoup


def robust_table_scraper(url, output_filename, table_index=0, delay=1):
    """
    Robust table scraper with error handling and rate limiting.

    Args:
        url: URL of the webpage containing the table.
        output_filename: Name of the output CSV file.
        table_index: Index of the table to extract (default: 0).
        delay: Seconds to sleep before the request (default: 1).

    Returns:
        A human-readable status string (success or error description).
    """
    try:
        # Add delay to be respectful to the server
        time.sleep(delay)

        # Fetch and parse the page; the with-block closes the connection.
        with urlopen(url) as response:
            soup = BeautifulSoup(response, 'html.parser')

        # Validate that the requested table exists before writing anything.
        tables = soup.find_all('table')
        if not tables:
            raise ValueError("No tables found on the webpage")
        if table_index >= len(tables):
            raise IndexError(f"Table index {table_index} out of range. Found {len(tables)} tables.")

        table = tables[table_index]
        rows = table.find_all('tr')

        # Save to CSV with explicit utf-8 encoding.
        with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            for row in rows:
                # ' '.join(text.split()) collapses all runs of whitespace
                # (including newlines) into single spaces.
                row_data = [' '.join(cell.get_text().split())
                            for cell in row.find_all(['td', 'th'])]
                if row_data:  # Only write non-empty rows
                    writer.writerow(row_data)

        return f"Successfully saved {len(rows)} rows to {output_filename}"
    except (URLError, HTTPError) as e:
        # HTTPError subclasses URLError; both listed for explicitness.
        return f"Network error: {e}"
    except Exception as e:
        return f"Error: {e}"


# Example usage with error handling
result = robust_table_scraper(
    'https://www.tutorialspoint.com/python/python_dictionary.htm',
    'dictionary_methods.csv'
)
print(result)
The enhanced scraper includes proper error handling and rate limiting −
Successfully saved 5 rows to dictionary_methods.csv
Key Features
The HTML table to CSV conversion process offers several advantages −
Automation − Eliminates manual copy-paste for multiple tables
Data cleaning − Automatically removes HTML tags and extra whitespace
