C program to detect tokens in a C program

Here, we will create a C program that detects the tokens in a piece of C code. Detecting tokens is the job of the lexical analysis phase of a compiler: the lexical analyzer splits the program text into tokens and passes them on to the syntax analyzer.

A token is the smallest meaningful unit of the code. It can be a keyword, identifier, constant, string literal, or symbol.

Syntax

void detectTokens(char* sourceCode);

Types of Tokens

Examples of different types of tokens in C −

  • Keywords: for, if, while, int, return, etc.
  • Identifiers: variables, functions, etc.
  • Separators: ',', ';', '(', ')', etc.
  • Operators: '-', '=', '++', etc.
  • Constants: numbers like 123, 45.67

Example: C Program to Detect Tokens

This program analyzes a C code string and identifies different types of tokens −

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Reports whether ch separates tokens: a space, an arithmetic,
 * comparison or assignment symbol, a comma, a semicolon, or any
 * round, square or curly bracket.
 */
bool isValidDelimiter(char ch) {
    switch (ch) {
    case ' ':
    case '+': case '-': case '*': case '/':
    case ',': case ';':
    case '>': case '<': case '=':
    case '(': case ')':
    case '[': case ']':
    case '{': case '}':
        return true;
    default:
        return false;
    }
}

/*
 * Reports whether ch is one of the single-character operators
 * recognised by the tokenizer: + - * / > < =
 */
bool isValidOperator(char ch) {
    switch (ch) {
    case '+':
    case '-':
    case '*':
    case '/':
    case '>':
    case '<':
    case '=':
        return true;
    default:
        return false;
    }
}

/*
 * Checks whether str is a well-formed C identifier.
 *
 * An identifier is non-empty, starts with a letter or an underscore, and
 * contains only letters, digits and underscores after that. Every
 * character is inspected, not just the first one. Keywords are not
 * rejected by this function.
 *
 * Returns true for a valid identifier, false otherwise (including when
 * str is NULL or empty).
 */
bool isvalidIdentifier(char* str) {
    if (str == NULL || str[0] == '\0')
        return false;

    /* <ctype.h> functions require a value representable as unsigned char. */
    if (!isalpha((unsigned char)str[0]) && str[0] != '_')
        return false;

    for (size_t i = 1; str[i] != '\0'; i++) {
        if (!isalnum((unsigned char)str[i]) && str[i] != '_')
            return false;
    }
    return true;
}

/*
 * Checks whether str is a C keyword.
 *
 * The table holds the 32 keywords of C89 plus the C99 additions
 * "inline" and "restrict". Comparison is exact and case-sensitive.
 *
 * Returns true if str matches a keyword, false otherwise (including
 * when str is NULL).
 */
bool isValidKeyword(char* str) {
    static const char *const keywords[] = {
        "auto",     "break",    "case",     "char",
        "const",    "continue", "default",  "do",
        "double",   "else",     "enum",     "extern",
        "float",    "for",      "goto",     "if",
        "inline",   "int",      "long",     "register",
        "restrict", "return",   "short",    "signed",
        "sizeof",   "static",   "struct",   "switch",
        "typedef",  "union",    "unsigned", "void",
        "volatile", "while"
    };
    const size_t count = sizeof keywords / sizeof keywords[0];

    if (str == NULL)
        return false;

    for (size_t i = 0; i < count; i++) {
        if (strcmp(str, keywords[i]) == 0)
            return true;
    }
    return false;
}

/*
 * Checks whether str is a decimal integer literal.
 *
 * Accepts an optional leading '-' followed by one or more decimal
 * digits. A '-' anywhere else, any non-digit character, a lone "-" and
 * the empty string are all rejected.
 *
 * Returns true if str is an integer, false otherwise (including when
 * str is NULL).
 */
bool isValidInteger(char* str) {
    if (str == NULL)
        return false;

    /* Skip the sign so the loop below only has to look for digits. */
    size_t i = (str[0] == '-') ? 1 : 0;

    if (str[i] == '\0')
        return false; /* empty string, or a sign with no digits */

    for (; str[i] != '\0'; i++) {
        if (str[i] < '0' || str[i] > '9')
            return false;
    }
    return true;
}

/*
 * Checks whether str is a decimal real-number literal.
 *
 * Accepts an optional leading '-', then digits containing exactly one
 * '.', with at least one digit somewhere (so "1.", ".5" and "-3.14"
 * pass). Plain integers, a bare ".", more than one '.', a '-' after the
 * first position and any other character are rejected.
 *
 * Returns true if str is a real number, false otherwise (including when
 * str is NULL or empty).
 */
bool isRealNumber(char* str) {
    if (str == NULL)
        return false;

    bool hasDecimal = false;
    bool hasDigit = false;

    for (size_t i = (str[0] == '-') ? 1 : 0; str[i] != '\0'; i++) {
        if (str[i] == '.') {
            if (hasDecimal)
                return false; /* second decimal point */
            hasDecimal = true;
        } else if (str[i] >= '0' && str[i] <= '9') {
            hasDigit = true;
        } else {
            return false;
        }
    }
    return hasDecimal && hasDigit;
}

/*
 * Returns a newly allocated copy of str[left..right] (both inclusive).
 *
 * The caller owns the result and must release it with free(). The
 * caller is also responsible for left and right lying within str.
 * When right < left the range is treated as empty and an empty string
 * is returned.
 *
 * Returns NULL if str is NULL, left is negative, or allocation fails.
 */
char* subString(char* str, int left, int right) {
    if (str == NULL || left < 0)
        return NULL;

    /* Clamp an inverted range to zero so the size can never go negative. */
    size_t count = (right >= left) ? (size_t)(right - left) + 1 : 0;

    char *subStr = malloc(count + 1);
    if (subStr == NULL)
        return NULL;

    memcpy(subStr, str + left, count);
    subStr[count] = '\0';
    return subStr;
}

/*
 * Splits str into tokens and prints one classification line per token
 * to stdout.
 *
 * Delimiter characters are consumed one at a time; those that are also
 * operators are printed as "Valid operator", the rest (spaces, commas,
 * brackets, ...) are skipped silently. Every run of non-delimiter
 * characters is extracted and classified, in this order, as keyword,
 * integer, real number, valid identifier or invalid identifier.
 *
 * Scanning never reads beyond the string's terminator. A NULL str is
 * ignored. If a token cannot be allocated, a message is written to
 * stderr and scanning stops.
 */
void detectTokens(char* str) {
    if (str == NULL)
        return;

    int length = (int)strlen(str);
    int left = 0;

    while (left < length) {
        /* A delimiter is handled on its own; only operators are reported. */
        if (isValidDelimiter(str[left])) {
            if (isValidOperator(str[left]))
                printf("Valid operator : '%c'\n", str[left]);
            left++;
            continue;
        }

        /* Extend the token up to the next delimiter or the end of input. */
        int right = left;
        while (right < length && !isValidDelimiter(str[right]))
            right++;

        char *subStr = subString(str, left, right - 1);
        if (subStr == NULL) {
            fprintf(stderr, "detectTokens: out of memory\n");
            return;
        }

        if (isValidKeyword(subStr))
            printf("Valid keyword : '%s'\n", subStr);
        else if (isValidInteger(subStr))
            printf("Valid Integer : '%s'\n", subStr);
        else if (isRealNumber(subStr))
            printf("Real Number : '%s'\n", subStr);
        else if (isvalidIdentifier(subStr))
            printf("Valid Identifier : '%s'\n", subStr);
        else
            printf("Invalid Identifier : '%s'\n", subStr);

        free(subStr);
        left = right;
    }
}

/*
 * Runs the tokenizer on a fixed sample statement: prints the statement,
 * then one line per token found in it.
 */
int main(void) {
    char str[100] = "float x = a + 1b; ";
    printf("The Program is : '%s'\n", str);
    printf("All Tokens are :\n");
    detectTokens(str);
    return 0;
}
The Program is : 'float x = a + 1b; '
All Tokens are :
Valid keyword : 'float'
Valid Identifier : 'x'
Valid operator : '='
Valid Identifier : 'a'
Valid operator : '+'
Invalid Identifier : '1b'

How It Works

The program uses the following approach −

  • Scans the input string character by character using two indices (left and right) that mark the start and end of the current token
  • Identifies delimiters (spaces, operators, punctuation) to separate tokens
  • Classifies each extracted token using validation functions
  • Checks for keywords, identifiers, integers, real numbers, and operators

Conclusion

This lexical analyzer demonstrates the basic token recognition phase of a compiler. It successfully identifies and classifies different types of tokens in C source code, which is the first step in the compilation process.

Updated on: 2026-03-15T12:51:38+05:30

31K+ Views

Kickstart Your Career

Get certified by completing the course

Get Started
Advertisements