How to convert speech to text using JavaScript?

Overview

To convert spoken words to text, we use the Web Speech API's SpeechRecognition component. The SpeechRecognition interface recognizes spoken audio and converts it to text. The spoken words are processed and displayed as text in HTML elements on the browser screen.

Syntax

The basic syntax for creating a speech recognition instance:

// Pick whichever constructor the browser exposes: the standard name first,
// then the WebKit-prefixed one. `recognition` is declared exactly once, so the
// snippet can be pasted as-is. If neither name exists, `new` throws a
// TypeError, so check for support before running this.
const SpeechRecognitionImpl = window.SpeechRecognition || window.webkitSpeechRecognition;
const recognition = new SpeechRecognitionImpl();

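Note: webkitSpeechRecognition is the vendor-prefixed name of the constructor, exposed by Chrome and Safari, while SpeechRecognition is the unprefixed, standard name. A browser may expose only one of the two, so check for both before creating an instance (see the compatibility check below).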

Browser Compatibility Check

Before implementing speech recognition, it's important to check browser support:

/**
 * Reports whether the current environment exposes the Web Speech API's
 * recognition constructor under its standard or WebKit-prefixed name.
 * Logs the outcome to the console as a side effect.
 *
 * @returns {boolean} true when speech recognition is available; false
 *   otherwise, including environments where `window` is not defined.
 */
function checkBrowserSupport() {
    // The `typeof` guard matters outside browsers (Node, server-side
    // rendering): evaluating `'x' in window` there would throw a
    // ReferenceError instead of reporting "not supported".
    const isSupported = typeof window !== 'undefined' &&
        ('webkitSpeechRecognition' in window || 'SpeechRecognition' in window);

    console.log(isSupported
        ? 'Speech recognition supported'
        : 'Speech recognition not supported');
    return isSupported;
}

console.log('Browser support:', checkBrowserSupport());

Basic Implementation

Here's a complete example that demonstrates speech-to-text conversion:

<!DOCTYPE html>
<html>
<head>
    <title>Speech to Text Converter</title>
    <style>
        /* Centered, bordered card that wraps the whole widget. */
        .container {
            max-width: 500px;
            margin: 50px auto;
            padding: 20px;
            border: 1px solid #ddd;
            border-radius: 8px;
        }
        /* Shared look for the Start and Stop buttons. */
        .btn {
            background: #007bff;
            color: white;
            border: none;
            padding: 10px 20px;
            border-radius: 5px;
            cursor: pointer;
            margin: 10px 0;
        }
        /* Darker blue while the pointer is over a button. */
        .btn:hover {
            background: #0056b3;
        }
        /* Status line (element #status) whose text the script updates. */
        .status {
            color: #28a745;
            font-weight: bold;
            margin: 10px 0;
        }
        /* Transcript box (element #result); min-height keeps it visible when empty. */
        .result {
            background: #f8f9fa;
            padding: 15px;
            border-radius: 5px;
            margin: 10px 0;
            min-height: 50px;
        }
    </style>
</head>
<body>
    <!-- Widget wrapper; the script below looks up the child elements by id. -->
    <div class="container">
        <h1>Speech to Text Converter</h1>
        <!-- Controls: Stop starts out disabled and is enabled by the script once listening begins. -->
        <button id="startBtn" class="btn">? Start Speaking</button>
        <button id="stopBtn" class="btn" disabled>?? Stop</button>
        <!-- Status line: the script writes the listening / ready / error text here. -->
        <div id="status" class="status"></div>
        <!-- Transcript area: the placeholder text is replaced by the script's output. -->
        <div id="result" class="result">
            Click "Start Speaking" and speak into your microphone...
        </div>
    </div>

    <script>
        /**
         * Connects the page's Start/Stop buttons to the Web Speech API and
         * renders the live transcript.
         *
         * Expects these elements to exist in the document when constructed:
         * #startBtn, #stopBtn, #status and #result.
         */
        class SpeechToText {
            constructor() {
                this.recognition = null;
                this.isListening = false;
                // True from an `error` event until the next start, so the
                // `end` event that follows an error does not replace the
                // error message with the ready status.
                this.hasError = false;
                this.initializeRecognition();
                this.setupEventListeners();
            }

            /**
             * Creates and configures the recognition object, or reports that
             * the API is unavailable and leaves `this.recognition` as null.
             */
            initializeRecognition() {
                // Check for browser support (standard name first, then prefixed)
                const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;

                if (!SpeechRecognition) {
                    document.getElementById('result').textContent =
                        '? Speech recognition not supported in this browser.';
                    // Nothing can be started, so do not offer the button.
                    document.getElementById('startBtn').disabled = true;
                    return;
                }

                this.recognition = new SpeechRecognition();

                // Configure recognition settings
                this.recognition.continuous = true;
                this.recognition.interimResults = true;
                this.recognition.lang = 'en-US';

                this.setupRecognitionEvents();
            }

            /** Attaches the start/result/error/end handlers to the recognition object. */
            setupRecognitionEvents() {
                this.recognition.onstart = () => {
                    this.isListening = true;
                    this.hasError = false;
                    document.getElementById('status').textContent = '? Listening...';
                    document.getElementById('startBtn').disabled = true;
                    document.getElementById('stopBtn').disabled = false;
                };

                this.recognition.onresult = (event) => {
                    let finalTranscript = '';
                    let interimTranscript = '';

                    // Walk the whole result list rather than starting at
                    // event.resultIndex: the list holds every result of the
                    // session, so phrases finalized by earlier events stay
                    // on screen in continuous mode.
                    for (let i = 0; i < event.results.length; i++) {
                        const result = event.results[i];
                        const transcript = result[0].transcript;

                        if (result.isFinal) {
                            finalTranscript += transcript;
                        } else {
                            interimTranscript += transcript;
                        }
                    }

                    this.renderTranscript(finalTranscript, interimTranscript);
                };

                this.recognition.onerror = (event) => {
                    console.error('Speech recognition error:', event.error);
                    this.hasError = true;
                    document.getElementById('status').textContent =
                        '? Error: ' + event.error;
                    this.stopListening();
                };

                this.recognition.onend = () => {
                    // The session is already over; only the UI needs resetting,
                    // so clear the flag first to skip a redundant stop() call.
                    this.isListening = false;
                    this.stopListening();
                };
            }

            /**
             * Shows the transcript in #result: final text as plain text,
             * followed by the interim text in a grey span.
             *
             * Built with textContent so that characters such as `<` or `&` in
             * a transcript are displayed literally instead of parsed as HTML.
             *
             * @param {string} finalText - Concatenated finalized phrases.
             * @param {string} interimText - Concatenated in-progress phrases.
             */
            renderTranscript(finalText, interimText) {
                const resultElement = document.getElementById('result');
                const interimElement = document.createElement('span');

                interimElement.style.color = '#666';
                interimElement.textContent = interimText;

                // Assigning textContent drops any previous children.
                resultElement.textContent = finalText;
                resultElement.appendChild(interimElement);
            }

            /** Routes button clicks to startListening / stopListening. */
            setupEventListeners() {
                document.getElementById('startBtn').addEventListener('click', () => {
                    this.startListening();
                });

                document.getElementById('stopBtn').addEventListener('click', () => {
                    this.stopListening();
                });
            }

            /** Starts a recognition session unless unsupported or already listening. */
            startListening() {
                if (!this.recognition || this.isListening) {
                    return;
                }

                this.hasError = false;
                try {
                    this.recognition.start();
                } catch (err) {
                    // isListening only flips in onstart, so a second click can
                    // arrive while the first start() is still pending; the API
                    // signals that with InvalidStateError, which is safe to
                    // ignore. Anything else is unexpected and is rethrown.
                    if (!err || err.name !== 'InvalidStateError') {
                        throw err;
                    }
                }
            }

            /**
             * Stops an active session (if any) and resets the controls.
             * The status text returns to ready unless an error message is
             * currently being shown.
             */
            stopListening() {
                if (this.recognition && this.isListening) {
                    this.recognition.stop();
                }

                this.isListening = false;
                if (!this.hasError) {
                    document.getElementById('status').textContent = '? Ready';
                }
                document.getElementById('startBtn').disabled = false;
                document.getElementById('stopBtn').disabled = true;
            }
        }

        // Initialize the speech-to-text converter. The constructor sets up
        // recognition and attaches the button click handlers, so this must run
        // after the elements above have been parsed.
        const speechToText = new SpeechToText();
    </script>
</body>
</html>

Key Configuration Options

The SpeechRecognition API provides several configuration options:

// Apply every recognition setting in a single step.
Object.assign(recognition, {
    continuous: true,        // Keep listening until stopped
    interimResults: true,    // Show partial results while speaking
    lang: 'en-US',           // Set language (en-US, es-ES, fr-FR, etc.)
    maxAlternatives: 1,      // Number of alternative results
});

Browser Support

| Browser | Support | Implementation          |
|---------|---------|-------------------------|
| Chrome  | Full    | webkitSpeechRecognition |
| Firefox | Limited | Requires flag           |
| Safari  | Partial | webkitSpeechRecognition |
| Edge    | Full    | SpeechRecognition       |

Common Use Cases

Speech-to-text functionality is valuable for:

  • Voice Commands - Control web applications with voice
  • Dictation - Convert speech to text for forms and documents
  • Accessibility - Help users with mobility limitations
  • Voice Search - Implement voice-based search functionality

Error Handling

Always implement proper error handling for speech recognition:

// Log a readable message for the error codes we recognize; anything else is
// logged together with its raw code.
recognition.onerror = function(event) {
    const knownErrors = new Map([
        ['no-speech', 'No speech detected'],
        ['audio-capture', 'Microphone not available'],
        ['not-allowed', 'Permission denied'],
    ]);
    const code = event.error;

    if (knownErrors.has(code)) {
        console.log(knownErrors.get(code));
        return;
    }

    console.log('Speech recognition error:', code);
};

Conclusion

The Web Speech API's SpeechRecognition interface provides powerful speech-to-text capabilities for modern web applications. While browser support varies, it works well in Chrome and Safari, making it suitable for many web-based voice interaction scenarios.

Updated on: 2026-03-15T23:19:01+05:30

19K+ Views

Kickstart Your Career

Get certified by completing the course

Get Started
Advertisements