Article Categories
- All Categories
-
Data Structure
-
Networking
-
RDBMS
-
Operating System
-
Java
-
MS Excel
-
iOS
-
HTML
-
CSS
-
Android
-
Python
-
C Programming
-
C++
-
C#
-
MongoDB
-
MySQL
-
Javascript
-
PHP
-
Economics & Finance
How to scan for a string in multiple document formats (CSV, Text, MS Word) with Python?
Searching for strings across multiple document formats is a common task in data processing and content management. Python provides excellent libraries to handle CSV, text, and MS Word documents efficiently.
Required Packages
Install the following packages before starting:
pip install beautifulsoup4 python-docx
CSV File Search Function
The CSV search function uses csv.reader from the standard csv module to iterate through rows and columns:
import csv
def csv_stringsearch(input_file, input_string):
    """
    Function: search a string in csv files (case-insensitive).
    args: input file, input string
    returns: True if any cell contains the string, otherwise False
    """
    # Every cell is lower-cased before comparison, so the search term has to
    # be lower-cased too; otherwise a term such as 'Hello' could never match.
    needle = input_string.lower()
    # newline='' lets the csv module do its own line-ending handling, which
    # keeps quoted fields containing embedded newlines intact.
    with open(input_file, newline='') as file:
        for row in csv.reader(file):
            if any(needle in column.lower() for column in row):
                return True
    return False
# Example usage: check whether any cell of sample.csv contains 'hello'
# and print the boolean result.
result = csv_stringsearch('sample.csv', 'hello')
print(f"String found in CSV: {result}")
Text File Search Function
Text file searching requires handling different encodings. We use UnicodeDammit (part of Beautiful Soup) to detect the encoding automatically:
from bs4 import UnicodeDammit
def text_stringsearch(input_file, input_string):
    """
    Function: search a string in text files (case-insensitive).
    args: input file, input string
    returns: True if any line contains the string, otherwise False
    """
    # Lines are lower-cased before comparison, so normalise the search term
    # the same way; otherwise a term such as 'Hello' could never match.
    needle = input_string.lower()
    # Detect encoding from a sample of the first 1024 bytes only
    with open(input_file, 'rb') as file:
        content = file.read(1024)
    guessencoding = UnicodeDammit(content)
    # NOTE(review): original_encoding is presumably None when detection
    # fails; open() then falls back to the platform default encoding.
    encoding = guessencoding.original_encoding
    # Search in file, line by line, so large files are never fully loaded
    with open(input_file, encoding=encoding) as file:
        for line in file:
            if needle in line.lower():
                return True
    return False
# Example usage: check whether any line of sample.txt contains 'hello'
# and print the boolean result.
result = text_stringsearch('sample.txt', 'hello')
print(f"String found in text file: {result}")
MS Word Document Search Function
For MS Word documents, we use the python-docx library to read paragraphs:
import docx
def MSDocx_stringsearch(input_file, input_string):
    """
    Function: search a string in MS Word documents (case-insensitive).
    args: input file, input string
    returns: True if any paragraph contains the string, otherwise False
    """
    # Paragraph text is lower-cased before comparison, so the search term
    # must be lower-cased too; otherwise 'Hello' could never match.
    needle = input_string.lower()
    doc = docx.Document(input_file)
    # Only doc.paragraphs is scanned here.
    for paragraph in doc.paragraphs:
        if needle in paragraph.text.lower():
            return True
    return False
# Example usage: check whether any paragraph of sample.docx contains 'hello'
# and print the boolean result.
result = MSDocx_stringsearch('sample.docx', 'hello')
print(f"String found in Word doc: {result}")
Complete Search Implementation
Here's the complete script that searches across all supported file formats:
import os
import csv
import docx
from bs4 import UnicodeDammit
def csv_stringsearch(input_file, input_string):
    """
    Function: search a string in csv files (case-insensitive).
    args: input file, input string
    returns: True if any cell contains the string, otherwise False
    """
    # Cells are lower-cased before comparison, so lower-case the search term
    # as well; otherwise a term such as 'Hello' could never match.
    needle = input_string.lower()
    # newline='' lets the csv module do its own line-ending handling, which
    # keeps quoted fields containing embedded newlines intact.
    with open(input_file, newline='') as file:
        for row in csv.reader(file):
            if any(needle in column.lower() for column in row):
                return True
    return False
def text_stringsearch(input_file, input_string):
    """
    Function: search a string in text files (case-insensitive).
    args: input file, input string
    returns: True if any line contains the string, otherwise False
    """
    # Lines are lower-cased before comparison, so normalise the search term
    # the same way; otherwise a term such as 'Hello' could never match.
    needle = input_string.lower()
    # Guess the encoding from a sample of the first 1024 bytes only.
    with open(input_file, 'rb') as file:
        content = file.read(1024)
    guessencoding = UnicodeDammit(content)
    # NOTE(review): original_encoding is presumably None when detection
    # fails; open() then falls back to the platform default encoding.
    encoding = guessencoding.original_encoding
    with open(input_file, encoding=encoding) as file:
        for line in file:
            if needle in line.lower():
                return True
    return False
def MSDocx_stringsearch(input_file, input_string):
    """
    Function: search a string in MS Word documents (case-insensitive).
    args: input file, input string
    returns: True if any paragraph contains the string, otherwise False
    """
    # Paragraph text is lower-cased before comparison, so the search term
    # must be lower-cased too; otherwise 'Hello' could never match.
    needle = input_string.lower()
    doc = docx.Document(input_file)
    # Only doc.paragraphs is scanned here.
    for paragraph in doc.paragraphs:
        if needle in paragraph.text.lower():
            return True
    return False
# Dispatch table: lower-case file extension (without the dot) -> the search
# function that knows how to read that format.
function_mapping = {
    'csv': csv_stringsearch,
    'txt': text_stringsearch,
    'docx': MSDocx_stringsearch,
}
def main(input_string, search_dir='.'):
    """
    Function: Search for a string in all supported files under a directory
    args: input string, search_dir (directory tree to walk; defaults to the
          current directory)
    Prints one line per matching file and one line per file that could not
    be processed; returns nothing.
    """
    # The per-format search functions compare against lower-cased text, so
    # normalise the search term once here.
    needle = input_string.lower()
    for root, _dirs, files in os.walk(search_dir):
        for file in files:
            # splitext returns '' for names without an extension, so a file
            # literally named "csv" is not mistaken for a CSV document.
            extension = os.path.splitext(file)[1].lstrip('.').lower()
            search_function = function_mapping.get(extension)
            if search_function is None:
                continue  # unsupported format
            full_file_path = os.path.join(root, file)
            # Best-effort scan: one unreadable or corrupt file is reported
            # and skipped rather than aborting the whole walk.
            try:
                if search_function(full_file_path, needle):
                    print(f'*** String found in {full_file_path}')
            except Exception as e:
                print(f'Error processing {full_file_path}: {e}')
# Script entry point: run the search with a fixed demo term when the file is
# executed directly (not when it is imported).
if __name__ == '__main__':
    string_to_search = 'hello'
    print(f'Searching for: "{string_to_search}"\n')
    main(string_to_search.lower())
Searching for: "hello"

*** String found in .\sample.txt
*** String found in .\data.csv
*** String found in .\document.docx
Command Line Version
To make the script accept command line arguments, add this modification:
import argparse
if __name__ == '__main__':
    # Read the search term from -s/--string, defaulting to 'hello'.
    parser = argparse.ArgumentParser(
        description='Search for strings in multiple file formats',
    )
    parser.add_argument(
        '-s',
        '--string',
        type=str,
        default='hello',
        help='String to search for',
    )
    args = parser.parse_args()
    print(f'Searching for: "{args.string}"\n')
    # The term is lower-cased before it is handed to main().
    main(args.string.lower())
Run from command line as: python search_script.py -s "your search term"
Conclusion
This multi-format string search solution efficiently handles CSV, text, and Word documents using appropriate Python libraries. The modular design makes it easy to extend support for additional file formats by adding new search functions to the mapping dictionary.
