Lexical analysis is step 1 of compiling code down to machine language. The process breaks source code down into a long list of pieces called tokens. This list of tokens is used by a parser algorithm that extracts meaning from the order and arrangement of the tokens. Here is a small example of lex analysis:

code:

int main(void) {

float myvar = 2.5;

return 0;

}

list of tokens:

  1. int type
  2. main reserved word
  3. (
  4. void keyword
  5. )
  6. {
  7. float keyword
  8. myvar identifier
  9. = operator
  10. 2.5 floating point constant
  11. ; end statement
  12. return keyword
  13. 0 integer constant
  14. ; end statement
  15. }

As you can see, the list of tokens gets long rather quickly. Absolutely no syntax checking is done during lex. That happens later down the line.

I have written a basic lexical analyzer to break C++ code into tokens.  Its own source code is C++ as well.

// B.K. Turley
// 1/26/2011

// Compile with g++ only. VS2010 limits block nesting to 128 levels,
// and the massive if..else..if..else statement in identify_token()
// exceeds that limit.

// Known bugs:
// block comments are wrongly terminated with / instead of */

// Tokens are also stored in a linked list for later processing

#include <iostream>  //  Include <iostream> whenever using C++ I/O (cin and cout)
#include <iomanip>   //  Include <iomanip> along with iostream and you can't go wrong
#include <fstream>   //  Include <fstream> whenever working with external files.
#include <string>    //  Include <string> whenever using variable of type string.
#include <sstream>
#include <list>

using namespace std;


int find_split_point(string word);
	// Accepts a string; returns an integer giving the length of the first token
	// in the string (0 when no token is recognized at the start of the string).

void identify_token(string str);
	// Accepts a string; prints the type of token (operator, reserved, meta, etc.) followed by str.

// Global list intended to hold the tokens; NOTE(review): nothing in the code
// shown here appends to it -- confirm whether it is filled elsewhere.
list<string> tokens;
int main()
    {

    string filename = "";
	fstream in;
	while(!in.is_open()){
	cout << "Enter a valid C++ file file to tokenize  ";
        cin >> filename;
	in.open(filename.c_str());
	}

	std::stringstream buffer;
	buffer << in.rdbuf();

	string lexme(buffer.str());


    //begin lex
    while(lexme != ""){
		int splitpoint = find_split_point(lexme);// if splitpoint is set to anything but zero here, string begins with operator or META

		if(splitpoint==0)//   string doesn't begin with op or meta. so set splitpoint to the index to the first occurence of space, operator, or META
				splitpoint = lexme.find_first_of(" #"/=+-*%!^&|<>~,?.:;()[]{}t");

		if(splitpoint==lexme.length()){ // this only executes whenever the very last piece of the string contains no ops or META
						identify_token(lexme);
						lexme = "";
		}
		else{
				if(splitpoint==0) splitpoint++;
				string token = lexme.substr(0,splitpoint);
				identify_token(token);
				lexme = lexme.substr(splitpoint, lexme.length()-splitpoint);
		}
	}

    return 0;
}


// Length of the numeric literal at the start of `word` (which must begin with
// a digit, or with '.' followed by a digit). Digits, letters, '_' and '.' are
// all consumed so that literals such as 2.5, 0x1F, 10UL or 1e5 stay in one piece.
static int leading_number_length(const std::string& word){
	static const char kNumberChars[] =
		"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_.";
	const std::string::size_type end = word.find_first_not_of(kNumberChars);
	return static_cast<int>(end == std::string::npos ? word.length() : end);
}

// Returns the length of the token at the very start of `word` when that token
// is an operator, punctuation/whitespace character, preprocessor directive,
// string literal, comment or numeric literal. Returns 0 when `word` is empty
// or starts with an ordinary word (identifier/keyword); the caller then has to
// find the end of that word itself.
//
// Examples:
//     find_split_point("hello there")        returns 0
//     find_split_point("++i;")               returns 2
//     find_split_point("!!crazy!")           returns 1  (there is no !! operator)
//     find_split_point("#include <string>")  returns 8  (#include is a meta word)
//     find_split_point("/* a / b */ x")      returns 11 (ends at the closing */)
//
// Unterminated string literals and block comments extend to the end of `word`.
// The parameter stays by-value to match the forward declaration.
int find_split_point(std::string word){
	const std::string::size_type len = word.length();
	if(len == 0)
		return 0;

	const char first  = word[0];
	// '\0' stands in for "no such character"; it never equals anything tested below.
	const char second = (len > 1) ? word[1] : '\0';
	const char third  = (len > 2) ? word[2] : '\0';
	const bool second_is_digit = (second >= '0' && second <= '9');
	std::string::size_type end = std::string::npos;

	if(first >= '0' && first <= '9')
		return leading_number_length(word);

	switch(first){
	case '#':
		// Directive name: '#' plus the lowercase letters that follow it.
		end = word.find_first_not_of("abcdefghijklmnopqrstuvwxyz", 1);
		return static_cast<int>(end == std::string::npos ? len : end);

	case '"': {
		// String literal: scan to the closing quote, stepping over escapes.
		std::string::size_type i = 1;
		while(i < len && word[i] != '"'){
			if(word[i] == '\\' && i + 1 < len)
				i++;   // skip the escaped character as well
			i++;
		}
		return static_cast<int>(i < len ? i + 1 : len);
	}

	case '/':
		if(second == '/'){
			// Line comment: everything up to (not including) the line break.
			end = word.find_first_of("\r\n");
			return static_cast<int>(end == std::string::npos ? len : end);
		}
		if(second == '*'){
			// Block comment: search from index 2 so that "/*/" is not taken as closed.
			end = word.find("*/", 2);
			return static_cast<int>(end == std::string::npos ? len : end + 2);
		}
		return (second == '=') ? 2 : 1;

	case '+':
		return (second == '+' || second == '=') ? 2 : 1;

	case '-':
		if(second == '>')
			return (third == '*') ? 3 : 2;   // ->* or ->
		return (second == '-' || second == '=') ? 2 : 1;

	case '*':
		return (second == '=' || second == '/') ? 2 : 1;

	case '%':
	case '!':
	case '^':
	case '=':
		return (second == '=') ? 2 : 1;

	case '&':
		return (second == '&' || second == '=') ? 2 : 1;

	case '|':
		return (second == '|' || second == '=') ? 2 : 1;

	case '<':
	case '>':
		if(second == first)
			return (third == '=') ? 3 : 2;   // <<= >>= or << >>
		return (second == '=') ? 2 : 1;

	case ':':
		return (second == ':') ? 2 : 1;

	case '.':
		if(second == '*')
			return 2;                        // .* pointer-to-member
		if(second_is_digit)
			return leading_number_length(word);   // literal such as .5
		return 1;

	// Single-character punctuation and whitespace.
	case '\'':
	case ',':
	case '~':
	case '?':
	case ';':
	case '(':
	case ')':
	case '{':
	case '}':
	case '[':
	case ']':
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return 1;

	default:
		break;
	}

	// not an operator
	return 0;
}

// One row of the fixed-spelling token table: exact token text -> printed label.
struct FixedToken {
	const char* text;
	const char* label;
};

// Every token that is recognized purely by its spelling. Lookup is by exact
// match, so row order does not matter; each spelling appears once.
static const FixedToken kFixedTokens[] = {
	// operators and punctuation
	{"?",   "questionmark"},
	{"'",   "singlequote"},
	{";",   "semicolon"},
	{"::",  "scoperes"},
	{":",   "colon   "},
	{"{",   "leftcurley"},
	{"}",   "rightcurly"},
	{"[",   "arraysubL"},
	{"]",   "arraysubR"},
	{".*",  "pointtomember"},
	{".",   "dotoperator"},
	{"->*", "pointtomtmber"},
	{"->",  "arrow"},
	{"(",   "leftparen"},
	{")",   "rightparen"},
	{"++",  "increment"},
	{"--",  "decrement"},
	{"typeid", "type info"},
	{"*_cast", "C++ cast"},   // kept for direct callers; matches only this literal text
	{"sizeof", "size info"},
	{"~",   "bitwise NOT"},
	{"!=",  "not equal"},
	{"!",   "NOT      "},
	{"-=",  "sub&assign"},
	{"-",   "minus"},
	{"+=",  "add&assign"},
	{"+",   "add/concat"},
	{"&&",  "logicAND"},
	{"&=",  "AND&assign"},
	{"&",   "address of"},
	{"*=",  "mult&assign"},
	{"*/",  "closecomment"},
	{"*",   "mult/deref"},
	{"new", "allocate"},
	{"delete", "deallocate"},
	{"/*",  "opencomment"},
	{"//",  "linecomment"},
	{"/=",  "div&assign"},
	{"/",   "divide op"},
	{"%=",  "mod&assign"},
	{"%",   "modulo"},
	{"<<=", "shftL&assign"},
	{"<<",  "shiftleft"},
	{"<=",  "less or eq"},
	{"<",   "less than"},
	{">>=", "shftR&assign"},
	{">>",  "shiftright"},
	{">=",  "greateroreq"},
	{">",   "greater than"},
	{"==",  "equal to"},
	{"=",   "assignment"},
	{"^=",  "XOR&assign"},
	{"^",   "bitwiseXOR"},
	{"||",  "logic OR"},
	{"|=",  "OR&assign"},
	{"|",   "bitwise OR"},
	{"throw", "throwex"},
	{",",   "sequence"},

	// reserved words
	{"and", "Reserved"},
	{"and_eq", "Reserved"},
	{"asm", "Reserved"},
	{"auto", "Reserved"},
	{"bitand", "Reserved"},
	{"bitor", "Reserved"},
	{"bool", "Type    "},
	{"break", "Reserved"},
	{"case", "Reserved"},
	{"catch", "Reserved"},
	{"char", "Type    "},
	{"class", "Classtype"},
	{"compl", "Reserved"},
	{"const", "Reserved"},
	{"const_cast", "Reserved"},
	{"continue", "Reserved"},
	{"default", "Reserved"},
	{"do", "Reserved"},
	{"double", "Type    "},
	{"dynamic_cast", "Reserved"},
	{"else", "Reserved"},
	{"enum", "Grouptype"},
	{"explicit", "Reserved"},
	{"export", "Reserved"},
	{"extern", "Reserved"},
	{"false", "Reserved"},
	{"float", "Type    "},
	{"for", "Reserved"},
	{"friend", "Reserved"},
	{"goto", "Reserved"},
	{"if", "Reserved"},
	{"inline", "Reserved"},
	{"int", "Type    "},
	{"long", "Type    "},
	{"mutable", "Reserved"},
	{"namespace", "Reserved"},
	{"not", "Reserved"},
	{"not_eq", "Reserved"},
	{"operator", "Reserved"},
	{"or", "Reserved"},
	{"or_eq", "Reserved"},
	{"private", "Reserved"},
	{"protected", "Reserved"},
	{"public", "Reserved"},
	{"register", "Reserved"},
	{"reinterpret_cast", "Reserved"},
	{"return", "Reserved"},
	{"short", "Type    "},
	{"signed", "Reserved"},
	{"static", "Reserved"},
	{"static_cast", "Reserved"},
	{"struct", "Classtype"},
	{"switch", "Reserved"},
	{"template", "Reserved"},
	{"this", "Reserved"},
	{"true", "Reserved"},
	{"try", "Reserved"},
	{"typedef", "Reserved"},
	{"typename", "Reserved"},
	{"union", "Classtype"},
	{"unsigned", "Reserved"},
	{"using", "Reserved"},
	{"virtual", "Reserved"},
	{"void", "Reserved"},
	{"volatile", "Reserved"},
	{"wchar_t", "Type    "},
	{"while", "Reserved"},
	{"xor", "Reserved"},
	{"xor_eq", "Reserved"},

	// predefined words
	{"cin", "Predefined"},
	{"endl", "Predefined"},
	{"main", "Predefined"},
	{"cout", "Predefined"},
	{"NULL", "Predefined"},
	{"string", "Predefined"},

	// META (preprocessor directives)
	{"#define", "META     "},
	{"#undef", "META     "},
	{"#ifdef", "META     "},
	{"#ifndef", "META     "},
	{"#else", "META     "},
	{"#endif", "META     "},
	{"#if", "META     "},
	{"#elif", "META     "},
	{"#error", "META     "},
	{"#line", "META     "},
	{"#pragma", "META     "},
	{"#include", "META     "}
};

// Writes one output line: the label, a tab, then the token text.
static void print_token(const char* label, const std::string& str){
	std::cout << label << '\t' << str << std::endl;
}

// Classifies one token and prints "<label>\t<token>" to standard output.
//
// Whitespace tokens (and the empty string) print nothing. Tokens with a fixed
// spelling are looked up in kFixedTokens; anything else is classified by shape
// as an integer constant, float constant, string constant, line comment or
// block comment. Whatever is left is assumed to be an identifier -- no syntax
// checking is done, the input is assumed to be valid C++.
void identify_token(std::string str){

	// purge whitespace
	if(str.empty() || str == " " || str == "\t" || str == "\r" || str == "\n")
		return;

	// operators, reserved words, predefined words and META
	const int fixed_count = static_cast<int>(sizeof(kFixedTokens) / sizeof(kFixedTokens[0]));
	for(int i = 0; i < fixed_count; i++){
		if(str == kFixedTokens[i].text){
			print_token(kFixedTokens[i].label, str);
			return;
		}
	}

	const std::string::size_type len = str.length();

	// numbers and floating points
	if(str.find_first_not_of("1234567890") == std::string::npos)
		print_token("CONST_NUM", str);
	else if(str.find_first_not_of(" 1234567890.") == std::string::npos)
		print_token("CONST_FLOAT", str);

	// string: needs both an opening and a closing quote
	else if(len >= 2 && str[0] == '"' && str[len - 1] == '"')
		print_token("CONST_STR", str);

	// line comments
	else if(len >= 2 && str[0] == '/' && str[1] == '/')
		print_token("LINECOMMENT", str);

	// block comments
	else if(len >= 2 && str[0] == '/' && str[1] == '*' && str[len - 2] == '*' && str[len - 1] == '/')
		print_token("BLOCKCOMMENT", str);

	// all checks complete: treat whatever is left as an identifier
	else
		print_token("IDENTIFIER", str);
}

Leave a Reply

Your email address will not be published.