Developing a Text Search Engine using the Whoosh Library in Python

Whoosh is a Python library for indexing text and searching through documents efficiently. It's particularly useful when building applications that need to find similarities, extract data based on conditions, or count occurrences of specific terms in documents like research papers.

Installation

First, install the Whoosh library using pip:

pip install whoosh

Setting Up the Search Engine

Let's create a complete text search engine step by step. First, import the required modules and create a directory for storing the index:

from whoosh.fields import Schema, TEXT, ID
from whoosh import index
from whoosh.qparser import QueryParser
import os

# Create the directory that will hold the index files.
# exist_ok=True makes this a no-op when the directory is already there,
# which avoids the check-then-create race of exists() + mkdir().
os.makedirs("search_index", exist_ok=True)

Defining the Schema

A schema specifies the fields of documents in an index. Here we define title, path, and content fields:

from whoosh.fields import Schema, TEXT, ID
from whoosh import index
import os

# Create the directory that will hold the index files.
# exist_ok=True makes this a no-op when the directory is already there,
# which avoids the check-then-create race of exists() + mkdir().
os.makedirs("search_index", exist_ok=True)

# Define the schema: the set of fields a document in this index may have.
# stored=True keeps the field's value in the index so it can be read back
# from a search hit.
schema = Schema(
    title=TEXT(stored=True),
    path=ID(stored=True),
    content=TEXT(stored=True),
)

# Create a new index in the directory using the schema above.
# NOTE: create_in() starts a fresh index; per the Whoosh documentation it
# clears any index that already exists in that directory.
search_index = index.create_in("search_index", schema)

Adding Documents to Index

Now let's add multiple documents to our search index:

from whoosh.fields import Schema, TEXT, ID
from whoosh import index
import os

# Create the index directory; exist_ok=True makes re-running the script safe
# and avoids the check-then-create race of exists() + mkdir().
os.makedirs("search_index", exist_ok=True)

# Fields available on every document; stored=True keeps the value in the
# index so it can be read back from a search hit.
schema = Schema(
    title=TEXT(stored=True),
    path=ID(stored=True),
    content=TEXT(stored=True),
)

# NOTE: create_in() starts a fresh index in the directory.
search_index = index.create_in("search_index", schema)

# Documents to index, one dict per document keyed by schema field name.
documents = [
    {
        "title": "Python Tutorial",
        "content": "Python is a powerful programming language for data science",
        "path": "/docs/python",
    },
    {
        "title": "Machine Learning Guide",
        "content": "Machine learning uses Python for data analysis and modeling",
        "path": "/docs/ml",
    },
    {
        "title": "Web Development",
        "content": "Python Django framework for web development projects",
        "path": "/docs/web",
    },
]

# Using the writer as a context manager commits on normal exit and cancels
# (releasing the writer) if adding a document raises.
with search_index.writer() as writer:
    for document in documents:
        writer.add_document(**document)

print("Documents indexed successfully!")
Output:

Documents indexed successfully!

Searching the Index

Now we can search through our indexed documents:

from whoosh.fields import Schema, TEXT, ID
from whoosh import index
from whoosh.qparser import QueryParser
import os

# Create and populate the index.
# exist_ok=True makes re-running the script safe and avoids the
# check-then-create race of exists() + mkdir().
os.makedirs("search_index", exist_ok=True)

schema = Schema(
    title=TEXT(stored=True),
    path=ID(stored=True),
    content=TEXT(stored=True),
)

# NOTE: create_in() starts a fresh index in the directory.
search_index = index.create_in("search_index", schema)

documents = [
    {
        "title": "Python Tutorial",
        "content": "Python is a powerful programming language for data science",
        "path": "/docs/python",
    },
    {
        "title": "Machine Learning Guide",
        "content": "Machine learning uses Python for data analysis and modeling",
        "path": "/docs/ml",
    },
]

# The context manager commits on normal exit and cancels the writer if
# adding a document raises.
with search_index.writer() as writer:
    for document in documents:
        writer.add_document(**document)

# Search the index. Unqualified terms are looked up in the "content" field.
parser = QueryParser("content", search_index.schema)
query = parser.parse("Python data")

with search_index.searcher() as searcher:
    # terms=True asks the searcher to record which query terms matched,
    # which is what matched_terms() reports below.
    results = searcher.search(query, terms=True)

    print(f"Found {len(results)} results:")
    for result in results:
        print(f"Title: {result['title']}")
        print(f"Content: {result['content']}")
        print(f"Score: {result.score:.2f}")
        print("---")

    # Show matched terms
    if results.has_matched_terms():
        print(f"Matched terms: {results.matched_terms()}")
Output:

Found 2 results:
Title: Python Tutorial
Content: Python is a powerful programming language for data science
Score: 1.79
---
Title: Machine Learning Guide
Content: Machine learning uses Python for data analysis and modeling
Score: 1.17
---
Matched terms: {('content', b'python'), ('content', b'data')}

Advanced Search Features

You can perform more complex searches using different query types:

from whoosh.fields import Schema, TEXT, ID
from whoosh import index
from whoosh.qparser import QueryParser
import os

# Setup: create the index directory, schema and a small two-document index.
# exist_ok=True makes re-running the script safe and avoids the
# check-then-create race of exists() + mkdir().
os.makedirs("search_index", exist_ok=True)

schema = Schema(
    title=TEXT(stored=True),
    path=ID(stored=True),
    content=TEXT(stored=True),
)

# NOTE: create_in() starts a fresh index in the directory.
search_index = index.create_in("search_index", schema)

# The context manager commits on normal exit and cancels the writer if
# adding a document raises.
with search_index.writer() as writer:
    writer.add_document(
        title="Python Basics",
        content="Learn Python programming fundamentals",
        path="/docs/basics",
    )
    writer.add_document(
        title="Advanced Python",
        content="Advanced Python techniques and patterns",
        path="/docs/advanced",
    )

# Different search queries
queries = [
    "Python",               # Single term
    "Python AND Advanced",  # Boolean AND
    "title:Python",         # Search in specific field
]

# The parser does not depend on the query string, so build it once instead
# of once per loop iteration. Unqualified terms search the "content" field.
parser = QueryParser("content", search_index.schema)

with search_index.searcher() as searcher:
    for query_string in queries:
        print(f"Query: '{query_string}'")
        results = searcher.search(parser.parse(query_string))

        for result in results:
            print(f"  - {result['title']}: {result.score:.2f}")
        print()
Output:

Query: 'Python'
  - Advanced Python: 2.40
  - Python Basics: 2.40

Query: 'Python AND Advanced'
  - Advanced Python: 4.80

Query: 'title:Python'
  - Advanced Python: 2.40
  - Python Basics: 2.40

Key Features

| Feature | Description | Use Case |
|---|---|---|
| TEXT Field | Full-text searchable | Document content, descriptions |
| ID Field | Identifier indexed as a single, untokenized value | File paths, document IDs |
| Boolean Queries | AND, OR, NOT operations | Complex search conditions |
| Field-specific Search | Search within specific fields | Title-only or content-only searches |

Conclusion

Whoosh provides a powerful and lightweight solution for building text search engines in Python. With its schema-based indexing and flexible query parser, you can quickly search through large document collections and extract relevant information with scoring and term matching capabilities.

---
Updated on: 2026-03-27T14:12:46+05:30

2K+ Views

Kickstart Your Career

Get certified by completing the course

Get Started
Advertisements