How to scan for a string in multiple document formats (CSV, Text, MS Word) with Python?

Searching for strings across multiple document formats is a common task in data processing and content management. Python provides excellent libraries to handle CSV, text, and MS Word documents efficiently.

Required Packages

Install the following packages before starting:

pip install beautifulsoup4 python-docx

CSV File Search Function

The CSV search function uses csv.reader from the standard csv module to iterate through rows and columns:

import csv

def csv_stringsearch(input_file, input_string):
    """
    Function: search a string in csv files (case-insensitive).
    args: input file, input string
    returns: True if any cell contains the string, otherwise False
    """
    # Every cell is lower-cased before the comparison, so the search term
    # has to be lower-cased as well or a mixed-case term could never match.
    needle = input_string.lower()

    # newline='' hands line-ending handling to the csv module, as its
    # documentation recommends for files passed to csv.reader.
    with open(input_file, newline='') as file:
        for row in csv.reader(file):
            for column in row:
                if needle in column.lower():
                    return True
    return False

# Example usage: look for "hello" in sample.csv and report the outcome
result = csv_stringsearch(input_file='sample.csv', input_string='hello')
print(f"String found in CSV: {result}")

Text File Search Function

Text file searching requires handling different encodings. We use UnicodeDammit (from the beautifulsoup4 package) to detect the encoding automatically:

from bs4 import UnicodeDammit

def text_stringsearch(input_file, input_string):
    """
    Function: search a string in text files (case-insensitive).
    args: input file, input string
    returns: True if any line contains the string, otherwise False
    """
    # Lines are lower-cased before the comparison, so the search term has
    # to be lower-cased too or a mixed-case term could never match.
    needle = input_string.lower()

    # Detect encoding from the first 1024 bytes only
    with open(input_file, 'rb') as file:
        content = file.read(1024)

    guessencoding = UnicodeDammit(content)
    # NOTE(review): original_encoding may be None when no encoding is
    # detected; open() then uses its platform default - confirm acceptable.
    encoding = guessencoding.original_encoding

    # Search in file, one line at a time, stopping at the first hit
    with open(input_file, encoding=encoding) as file:
        for line in file:
            if needle in line.lower():
                return True

    return False

# Example usage: look for "hello" in sample.txt and report the outcome
result = text_stringsearch(input_file='sample.txt', input_string='hello')
print(f"String found in text file: {result}")

MS Word Document Search Function

For MS Word documents, we use the python-docx library to read paragraphs:

import docx

def MSDocx_stringsearch(input_file, input_string):
    """
    Function: search a string in MS Word documents (case-insensitive).
    args: input file, input string
    returns: True if any paragraph in doc.paragraphs contains the string,
             otherwise False
    """
    # Paragraph text is lower-cased before the comparison, so the search
    # term has to be lower-cased too or a mixed-case term could never match.
    needle = input_string.lower()

    doc = docx.Document(input_file)
    for paragraph in doc.paragraphs:
        if needle in paragraph.text.lower():
            return True
    return False

# Example usage: look for "hello" in sample.docx and report the outcome
result = MSDocx_stringsearch(input_file='sample.docx', input_string='hello')
print(f"String found in Word doc: {result}")

Complete Search Implementation

Here's the complete script that searches across all supported file formats:

import os
import csv
import docx
from bs4 import UnicodeDammit

def csv_stringsearch(input_file, input_string):
    """
    Function: search a string in csv files (case-insensitive).
    args: input file, input string
    returns: True if any cell contains the string, otherwise False
    """
    # Cells are compared in lower case, so normalise the search term once
    # up front; without this a mixed-case term could never match.
    needle = input_string.lower()

    # newline='' is what the csv documentation recommends for files that
    # are handed to csv.reader.
    with open(input_file, newline='') as file:
        return any(
            needle in column.lower()
            for row in csv.reader(file)
            for column in row
        )

def text_stringsearch(input_file, input_string):
    """
    Function: search a string in text files (case-insensitive).
    args: input file, input string
    returns: True if any line contains the string, otherwise False
    """
    # Lines are compared in lower case, so normalise the search term once
    # up front; without this a mixed-case term could never match.
    needle = input_string.lower()

    # Sample the first 1024 bytes for encoding detection
    with open(input_file, 'rb') as file:
        content = file.read(1024)

    guessencoding = UnicodeDammit(content)
    # NOTE(review): original_encoding may be None when no encoding is
    # detected; open() then uses its platform default - confirm acceptable.
    encoding = guessencoding.original_encoding

    # Stream the file line by line and stop at the first match
    with open(input_file, encoding=encoding) as file:
        return any(needle in line.lower() for line in file)

def MSDocx_stringsearch(input_file, input_string):
    """
    Function: search a string in MS Word documents (case-insensitive).
    args: input file, input string
    returns: True if any paragraph in doc.paragraphs contains the string,
             otherwise False
    """
    # Paragraph text is compared in lower case, so normalise the search
    # term once up front; without this a mixed-case term could never match.
    needle = input_string.lower()

    doc = docx.Document(input_file)
    return any(needle in paragraph.text.lower() for paragraph in doc.paragraphs)

# Dispatch table: lower-case file extension (without the dot) -> search function
function_mapping = dict(
    csv=csv_stringsearch,
    txt=text_stringsearch,
    docx=MSDocx_stringsearch,
)

def main(input_string, root_dir='.'):
    """
    Function: Search for a string in all supported files under a directory tree
    args:
        input_string: text to look for; passed unchanged to the search
                      function registered for each file's extension
        root_dir: directory that is walked recursively
                  (defaults to the current directory)

    Prints one line for every file in which the string is found and one line
    for every file whose search raised an exception. Returns nothing.
    """
    for root, _dirs, files in os.walk(root_dir):
        for file in files:
            # Only names that actually contain a dot have an extension; a
            # file literally called "csv" must not be treated as a CSV file.
            _stem, dot, suffix = file.rpartition('.')
            if not dot:
                continue

            # Single lookup: None means the extension is not supported.
            search_function = function_mapping.get(suffix.lower())
            if search_function is None:
                continue

            full_file_path = os.path.join(root, file)
            try:
                if search_function(full_file_path, input_string):
                    print(f'*** String found in {full_file_path}')
            except Exception as e:
                # Best-effort scan: report the problem file and keep going
                # rather than aborting the whole walk.
                print(f'Error processing {full_file_path}: {e}')

# Run the search only when this file is executed as a script
if __name__ == '__main__':
    string_to_search = 'hello'
    # The search functions compare against lower-cased text, so the term
    # is lower-cased before it is handed to main().
    lowered_term = string_to_search.lower()
    print(f'Searching for: "{string_to_search}"\n')
    main(lowered_term)
Output:

Searching for: "hello"

*** String found in .\sample.txt
*** String found in .\data.csv
*** String found in .\document.docx

Command Line Version

To make the script accept command line arguments, replace the `if __name__ == '__main__':` block with the following:

import argparse

if __name__ == '__main__':
    # Build the command line interface: a single optional -s/--string flag
    cli = argparse.ArgumentParser(
        description='Search for strings in multiple file formats',
    )
    cli.add_argument(
        '-s',
        '--string',
        type=str,
        default='hello',
        help='String to search for',
    )
    options = cli.parse_args()

    # Echo the term as typed, then search with its lower-cased form
    search_term = options.string
    print(f'Searching for: "{search_term}"\n')
    main(search_term.lower())

Run from command line as: python search_script.py -s "your search term"

Conclusion

This multi-format string search solution efficiently handles CSV, text, and Word documents using appropriate Python libraries. The modular design makes it easy to extend support for additional file formats by adding new search functions to the mapping dictionary.

Updated on: 2026-03-25T12:00:37+05:30

915 Views

Kickstart Your Career

Get certified by completing the course

Get Started
Advertisements