Adding Space between Potential Words using Python

When processing text data, it is common to encounter strings in which words have been merged together without spaces. This can be caused by OCR errors, delimiters lost during data extraction, or other data-quality problems. In this article, we will explore how to add spaces between potential words using Python and spaCy.

Basic Approach with spaCy

We will use spaCy, a popular Python library for natural language processing that provides tokenization, named entity recognition, and part-of-speech tagging:

Installation

First, install the spaCy library and download the English language model:

pip install spacy
python -m spacy download en_core_web_sm

Basic Implementation

import spacy

def add_spaces_basic(text, nlp=None):
    """Tokenize ``text`` and join the tokens with single spaces.

    Args:
        text: The string to tokenize.
        nlp: Optional tokenizer to use, normally an already-loaded spaCy
            pipeline. It is called as ``nlp(text)`` and must return an
            iterable of tokens exposing ``text`` and ``is_space``. Passing
            it in lets callers load the model once instead of on every
            call. When omitted, ``en_core_web_sm`` is loaded here.

    Returns:
        The non-whitespace tokens of ``text`` separated by single spaces.
    """
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    # Whitespace tokens carry no content; skipping them keeps the output
    # separated by exactly one space.
    return ' '.join(token.text for token in nlp(text) if not token.is_space)

# Run the basic helper on a sentence written without any spaces
input_text = "Thisisatestsentencewithnospaces"
output_text = add_spaces_basic(input_text)
print(f"Input: {input_text}", f"Output: {output_text}", sep="\n")
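Input: Thisisatestsentencewithnospaces
Output: Thisisatestsentencewithnospaces

Note that spaCy's tokenizer splits text on whitespace and punctuation rules only; it does not segment an unbroken run of letters into dictionary words, so this input comes back as a single token. Separating such text requires boundary cues (capital letters, digits, punctuation), as used in the sections below, or a dictionary-based word segmenter.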

Handling Punctuation

When adding spaces between words, we need to handle punctuation marks properly to ensure they don't disrupt word separation:

import spacy
import string

def add_spaces_with_punctuation(text, nlp=None):
    """Separate the words and punctuation marks of ``text`` with single spaces.

    Args:
        text: The string to tokenize.
        nlp: Optional tokenizer to use, normally an already-loaded spaCy
            pipeline. It is called as ``nlp(text)`` and must return an
            iterable of tokens exposing ``text`` and ``is_space``. When
            omitted, ``en_core_web_sm`` is loaded on every call.

    Returns:
        All non-whitespace tokens joined by one space, so each punctuation
        mark stands apart from its neighbours (``"Hello , world ?"``).
    """
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    # Every token -- word or punctuation mark -- is separated from the next
    # by one space. Concatenating the tokens and padding only the punctuation
    # would glue adjacent words together ("Hello world" -> "Helloworld").
    return ' '.join(token.text for token in nlp(text) if not token.is_space)

# Run the punctuation-aware helper on a sample string
text_with_punct = "Hello,worldhowaredoing?"
output = add_spaces_with_punctuation(text_with_punct)
print(f"Input: {text_with_punct}", f"Output: {output}", sep="\n")
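Input: Hello,worldhowaredoing?
Output: Hello , worldhowaredoing ?

The punctuation marks are separated from the text, but the run of letters between them stays a single token, because spaCy's tokenizer does not perform dictionary-based word segmentation.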

Handling Numbers and Mixed Content

For text containing numbers and mixed content, we need a more sophisticated approach:

import spacy
import re

def add_spaces_advanced(text, nlp=None):
    """Split camelCase words and digit runs in ``text``, then tokenize.

    Args:
        text: The string to separate, e.g. ``"HelloWorld123TestCase"``.
        nlp: Optional tokenizer to use, normally an already-loaded spaCy
            pipeline. It is called as ``nlp(text)`` and must return an
            iterable of tokens exposing ``text`` and ``is_space``. When
            omitted, ``en_core_web_sm`` is loaded on every call.

    Returns:
        The tokens of the pre-processed text joined by single spaces.
    """
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    # Isolate every run of digits.
    text = re.sub(r'(\d+)', r' \1 ', text)
    # Insert a space at case boundaries only: before a capital that follows a
    # non-capital ("helloWorld"), and before the last capital of an acronym
    # that starts a new word ("HTMLParser" -> "HTML Parser"). Splitting before
    # every capital would shatter acronyms into single letters.
    text = re.sub(
        r'(?<=[^A-Z\s])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])', ' ', text
    )
    # Collapse the extra whitespace introduced above.
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = [token.text for token in nlp(text) if not token.is_space]
    return ' '.join(tokens)

# Sample inputs mixing camelCase words and numbers
examples = [
    "HelloWorld123TestCase",
    "Python3ProgrammingLanguage",
    "MachineLearning2023Tutorial",
]

for example in examples:
    result = add_spaces_advanced(example)
    # The trailing empty argument yields the blank line between examples.
    print(f"Input: {example}", f"Output: {result}", "", sep="\n")
Input: HelloWorld123TestCase
Output: Hello World 123 Test Case

Input: Python3ProgrammingLanguage
Output: Python 3 Programming Language

Input: MachineLearning2023Tutorial
Output: Machine Learning 2023 Tutorial

Complete Solution with Error Handling

Here's a comprehensive solution that combines all approaches with proper error handling:

import spacy
import re
import string

class WordSeparator:
    """Separate run-together words using regex pre-processing and spaCy.

    The pipeline is loaded once in the constructor and reused by every call
    to ``separate_words``.
    """

    # Single-character punctuation marks that ``_postprocess`` attaches to the
    # preceding token.
    _PUNCTUATION = frozenset(string.punctuation)

    def __init__(self, model_name='en_core_web_sm'):
        """Load the spaCy pipeline.

        Args:
            model_name: Name of the spaCy model to load.

        Raises:
            OSError: Re-raised from ``spacy.load`` when the model cannot be
                loaded, after printing installation instructions.
        """
        try:
            self.nlp = spacy.load(model_name)
        except OSError:
            print("spaCy model not found. Please install it using:")
            print(f"python -m spacy download {model_name}")
            raise

    def separate_words(self, text):
        """Return ``text`` with spaces inserted between its detected words.

        Args:
            text: The string to separate. Falsy input yields ``""``.

        Returns:
            The separated words joined by single spaces, with punctuation
            attached to the word before it.
        """
        if not text:
            return ""

        # Step 1: Handle camelCase, numbers and punctuation
        processed_text = self._preprocess(text)

        # Step 2: Use spaCy for tokenization
        doc = self.nlp(processed_text)
        tokens = [
            token.text
            for token in doc
            if not token.is_space and token.text.strip()
        ]

        # Step 3: Clean and join
        return self._postprocess(tokens)

    def _preprocess(self, text):
        """Insert spaces at case, digit and punctuation boundaries."""
        # Insert a space at case boundaries only: before a capital that
        # follows a non-capital ("helloWorld"), and before the last capital of
        # an acronym that starts a new word ("HTMLParser" -> "HTML Parser").
        # Splitting before every capital would shatter acronyms such as "NLP"
        # into single letters.
        text = re.sub(
            r'(?<=[^A-Z\s])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])', ' ', text
        )
        # Add space around numbers
        text = re.sub(r'(\d+)', r' \1 ', text)
        # Add space around punctuation
        text = re.sub(f'([{re.escape(string.punctuation)}])', r' \1 ', text)
        # Clean multiple spaces
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def _postprocess(self, tokens):
        """Join ``tokens`` with spaces, gluing punctuation to the prior token."""
        result = []
        for token in tokens:
            if not token:
                # Nothing to emit for an empty token.
                continue
            # Exact membership in a set of single characters: a substring
            # test against string.punctuation would also match multi-character
            # tokens such as "<=".
            if result and token in self._PUNCTUATION:
                # Don't add space before punctuation
                result[-1] += token
            else:
                result.append(token)
        return ' '.join(result)

# Build one separator and run it over a few sample strings
separator = WordSeparator()
test_cases = [
    "HelloWorldPython",
    "TestCase123Example",
    "spaCyNLPLibrary2023",
    "HTMLCSSJavaScript",
]

# map() is lazy, so each string is still separated right before it is printed.
for test, separated in zip(
    test_cases, map(separator.separate_words, test_cases)
):
    print(f"'{test}' ? '{separated}'")
'HelloWorldPython' ? 'Hello World Python'
'TestCase123Example' ? 'Test Case 123 Example'
'spaCyNLPLibrary2023' ? 'spa Cy NLP Library 2023'
'HTMLCSSJavaScript' ? 'HTML CSS Java Script'

Comparison of Methods

| Method | Accuracy | Complexity | Best For |
|---|---|---|---|
| Basic spaCy | Medium | Low | Simple text separation |
| Regex + spaCy | High | Medium | Mixed content with numbers |
| Complete Solution | Highest | High | Production use cases |

Conclusion

Adding spaces between concatenated words requires combining regex preprocessing with NLP tokenization. Use the complete solution for production environments, while simpler methods work well for basic text separation tasks.

Updated on: 2026-03-27T12:34:52+05:30

1K+ Views

Kickstart Your Career

Get certified by completing the course

Get Started
Advertisements