Lexical analysis is step 1 of compiling code down to machine language. The process breaks source code down into a long list of pieces called tokens. This list of tokens is used by a parser algorithm that extracts meaning from the order and arrangement of the tokens. Here is a small example of lex analysis:
code:
int main(void) {
float myvar = 2.5;
return 0;
}
list of tokens:
- int type
- main reserved word
- (
- void keyword
- )
- {
- float keyword
- myvar identifier
- = operator
- 2.5 floating point constant
- ; end statement
- return keyword
- 0 integer constant
- ; end statemant
- }
As you can see, the list of tokens gets long rather quickly. Absolutely no syntax checking is done during lex. That happens later down the line.
I have written a basic lexical analyzer to break C++ code into tokens. Its own source code is C++ as well.
// B.K. Turley
// 1/26/2011
// Compile with g++ only, The nesting is limited to 128 in VS2010.
// This limit is exceeded by the massive if..else..if..else statement
// in identify_token()
// Known bugs:
// block comments are wrongly terminated with / instead of */
// Tokens are also stored in a linked list for later processing
#include <iostream> // Include <iostream> whenever using C++ I/O (cin and cout)
#include <iomanip> // Include <iomanip> along with iostream and you can't go wrong
#include <fstream> // Include <fstream> whenever working with external files.
#include <string> // Include <string> whenever using variable of type string.
#include <sstream>
#include <list>
using namespace std;
int find_split_point(string word);
//accepts a string, returns a integer representing the length of the first token in the string
void identify_token(string str);
//accepts a string, pritnt the type of token (operator, reverved, meta ect.) followed by str.
list<string> tokens;
int main()
{
string filename = "";
fstream in;
while(!in.is_open()){
cout << "Enter a valid C++ file file to tokenize ";
cin >> filename;
in.open(filename.c_str());
}
std::stringstream buffer;
buffer << in.rdbuf();
string lexme(buffer.str());
//begin lex
while(lexme != ""){
int splitpoint = find_split_point(lexme);// if splitpoint is set to anything but zero here, string begins with operator or META
if(splitpoint==0)// string doesn't begin with op or meta. so set splitpoint to the index to the first occurence of space, operator, or META
splitpoint = lexme.find_first_of(" #\"/=+-*%!^&|<>~,?.:;()[]{}\t");
if(splitpoint==lexme.length()){ // this only executes whenever the very last piece of the string contains no ops or META
identify_token(lexme);
lexme = "";
}
else{
if(splitpoint==0) splitpoint++;
string token = lexme.substr(0,splitpoint);
identify_token(token);
lexme = lexme.substr(splitpoint, lexme.length()-splitpoint);
}
}
return 0;
}
int find_split_point(string word){
// tests if first character is an operator(or META) and returns its length.
/*
examples:
isfirstcharoperator("hello there") returns zero.
isfirstcharoperator("++i;") returns 2.
isfirstcharoperator("!!crazy!") returns 1. (there is no !! operator)
isfirstcharoperator("#include <string>") returns 8. (#inclued is a meta word)
*/
string::iterator itr=word.begin();
if(itr != word.end() && *itr=='#'){
int i = 0;
while(*itr != ' '){
i++;
itr++;
}
return i;
}
else if(itr != word.end() && *itr=='"'){
int i = 2;
itr++;
while(*itr != '"'){
if(*itr=='\\'){
i++;
itr++;
}
i++;
itr++;
}
return i;
}
else if(itr != word.end() && *itr=='/'){
itr++;
if(itr != word.end() && *itr=='/'){
int i = 1;
while(itr != word.end() && *itr != '\n'){
i++;
itr++;
}
return i;
}else if(itr != word.end() && *itr=='=')
return 2;
else if(itr != word.end() && *itr=='*'){
int i = 3;
itr++;
do {
i++;
itr++;
}while(*itr != '/');//bug
return i;
}
}
else if(itr != word.end() && *itr=='+'){
itr++;
if(itr != word.end() && *itr=='+')
return 2;
else if(itr != word.end() && *itr=='=')
return 2;
else
return 1;
}
else if(itr != word.end() && *itr=='-'){
itr++;
if(itr != word.end() && *itr=='-')
return 2;
else if(itr != word.end() && *itr=='=')
return 2;
else if(itr != word.end() && *itr=='>')
return 2;
else
return 1;
}
else if(itr != word.end() && *itr=='*'){
itr++;
if(itr != word.end() && *itr=='=')
return 2;
else if(itr != word.end() && *itr=='/')
return 2;
else
return 1;
}
else if(itr != word.end() && *itr=='%'){
itr++;
if(itr != word.end() && *itr=='=')
return 2;
else
return 1;
}
else if(itr != word.end() && *itr=='!'){
itr++;
if(itr != word.end() && *itr=='=')
return 2;
else
return 1 ;
}
else if(itr != word.end() && *itr=='^'){
itr++;
if(itr != word.end() && *itr=='=')
return 2;
else{
return 1;
}
}
else if(itr != word.end() && *itr=='&'){
itr++;
if(itr != word.end() && *itr=='=')
return 2;
else if(itr != word.end() && *itr=='&')
return 2;
else
return 1;
}
else if(itr != word.end() && *itr=='|'){
itr++;
if(itr != word.end() && *itr=='|')
return 2;
else if(itr != word.end() && *itr=='=')
return 2;
else
return 1;
}
else if(itr != word.end() && *itr=='<'){ // bug,
itr++;
if(itr != word.end() && *itr=='<'){
itr++;
if(itr != word.end() && *itr=='=')
return 3;
else
return 2;
}
else if(itr != word.end() && *itr=='=')
return 2;
else
return 1;
}
else if(itr != word.end() && *itr=='>'){ // bug,
itr++;
if(itr != word.end() && *itr=='>'){
itr++;
if(itr != word.end() && *itr=='=')
return 3;
else
return 2;
}
else if(itr != word.end() && *itr=='=')
return 2;
else
return 1;
}
else if(itr != word.end() && *itr==':'){
itr++;
if(itr != word.end() && *itr==':')
return 2;
return 1;
}
else if(itr != word.end() && *itr=='='){
itr++;
if(itr != word.end() && *itr=='=')
return 2;
return 1;
}
else if(
*itr=='\'' ||
*itr==',' ||
*itr=='~' ||
*itr=='?' ||
*itr=='.' ||
*itr==';' ||
*itr==':' ||
*itr=='(' ||
*itr==')' ||
*itr=='{' ||
*itr=='}' ||
*itr=='[' ||
*itr==']' ||
*itr==' ' ||
*itr=='\r'||
*itr=='\n'
)
return 1;
// not an operator
return 0;
}
void identify_token(string str){
// purge whitespace
if ((!strcmp(str.c_str(), " ")))
return;
else if (!strcmp(str.c_str(), "\t"))
return;
else if (!strcmp(str.c_str(), "\r"))
return;
else if (!strcmp(str.c_str(), "\n"))
return;
//check for operators
else if (!strcmp(str.c_str(), "?"))
cout << "questionmark\t" << str << endl;
else if (!strcmp(str.c_str(), "'"))
cout << "singlequote\t" << str << endl;
else if (!strcmp(str.c_str(), ";"))
cout << "semicolon\t" << str << endl;
else if (!strcmp(str.c_str(), "::"))
cout << "scoperes\t" << str << endl;
else if (!strcmp(str.c_str(), ":"))
cout << "colon \t" << str << endl;
else if (!strcmp(str.c_str(), "{"))
cout << "leftcurley\t" << str << endl;
else if (!strcmp(str.c_str(), "}"))
cout << "rightcurly\t" << str << endl;
else if (!strcmp(str.c_str(), "["))
cout << "arraysubL\t" << str << endl;
else if (!strcmp(str.c_str(), "]"))
cout << "arraysubR\t" << str << endl;
else if (!strcmp(str.c_str(), ".*"))
cout << "pointtomember\t" << str << endl;
else if (!strcmp(str.c_str(), "."))
cout << "dotoperator\t" << str << endl;
else if (!strcmp(str.c_str(), "->*"))
cout << "pointtomtmber\t" << str << endl;
else if (!strcmp(str.c_str(), "->"))
cout << "arrow\t" << str << endl;
else if (!strcmp(str.c_str(), "("))
cout << "leftparen\t" << str << endl;
else if (!strcmp(str.c_str(), ")"))
cout << "rightparen\t" << str << endl;
else if (!strcmp(str.c_str(), "++"))
cout << "increment\t" << str << endl;
else if (!strcmp(str.c_str(), "--"))
cout << "decrement\t" << str << endl;
else if (!strcmp(str.c_str(), "typid"))
cout << "type info\t" << str << endl;
else if (!strcmp(str.c_str(), "*_cast"))
cout << "C++ cast\t" << str << endl;
else if (!strcmp(str.c_str(), "sizeof"))
cout << "size info\t" << str << endl;
else if (!strcmp(str.c_str(), "~"))
cout << "bitwise NOT\t" << str << endl;
else if (!strcmp(str.c_str(), "!="))
cout << "not equal\t" << str << endl;
else if (!strcmp(str.c_str(), "!"))
cout << "NOT \t" << str << endl;
else if (!strcmp(str.c_str(), "-="))
cout << "sub&assign\t" << str << endl;
else if (!strcmp(str.c_str(), "-"))
cout << "minus\t" << str << endl;
else if (!strcmp(str.c_str(), "+="))
cout << "add&assign\t" << str << endl;
else if (!strcmp(str.c_str(), "+"))
cout << "add/concat\t" << str << endl;
else if (!strcmp(str.c_str(), "&&"))
cout << "logicAND\t" << str << endl;
else if (!strcmp(str.c_str(), "&="))
cout << "AND&assign\t" << str << endl;
else if (!strcmp(str.c_str(), "&"))
cout << "address of\t" << str << endl;
else if (!strcmp(str.c_str(), "*="))
cout << "mult&assign\t" << str << endl;
else if (!strcmp(str.c_str(), "*/"))
cout << "closecomment\t" << str << endl;
else if (!strcmp(str.c_str(), "*"))
cout << "mult/deref\t" << str << endl;
else if (!strcmp(str.c_str(), "new"))
cout << "allocate\t" << str << endl;
else if (!strcmp(str.c_str(), "delete"))
cout << "deallocate\t" << str << endl;
else if (!strcmp(str.c_str(), "/*"))
cout << "opencomment\t" << str << endl;
else if (!strcmp(str.c_str(), "//")) // ?
cout << "linecomment\t" << str << endl;
else if (!strcmp(str.c_str(), "/"))
cout << "divide op\t" << str << endl;
else if (!strcmp(str.c_str(), "%="))
cout << "mod&assign\t" << str << endl;
else if (!strcmp(str.c_str(), "%"))
cout << "modulo\t" << str << endl;
else if (!strcmp(str.c_str(), "<<="))
cout << "shftL&assign\t" << str << endl;
else if (!strcmp(str.c_str(), "<<"))
cout << "shiftleft\t" << str << endl;
else if (!strcmp(str.c_str(), "<="))
cout << "less or eq\t" << str << endl;
else if (!strcmp(str.c_str(), "<"))
cout << "less than\t" << str << endl;
else if (!strcmp(str.c_str(), ">>="))
cout << "shftR&assign\t" << str << endl;
else if (!strcmp(str.c_str(), ">>"))
cout << "shiftright\t" << str << endl;
else if (!strcmp(str.c_str(), ">="))
cout << "greateroreq\t" << str << endl;
else if (!strcmp(str.c_str(), ">"))
cout << "greater than\t" << str << endl;
else if (!strcmp(str.c_str(), "=="))
cout << "equal to\t" << str << endl;
else if (!strcmp(str.c_str(), "="))
cout << "assignment\t" << str << endl;
else if (!strcmp(str.c_str(), "^="))
cout << "XOR&assign\t" << str << endl;
else if (!strcmp(str.c_str(), "^"))
cout << "bitwiseXOR\t" << str << endl;
else if (!strcmp(str.c_str(), "||"))
cout << "logic OR\t" << str << endl;
else if (!strcmp(str.c_str(), "|="))
cout << "OR&assign\t" << str << endl;
else if (!strcmp(str.c_str(), "throw"))
cout << "throwex\t" << str << endl;
else if (!strcmp(str.c_str(), ","))
cout << "sequence\t" << str << endl;
// check for reserved words
else if (!strcmp(str.c_str(), "and"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "and_eq"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "asm"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "auto"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "bitand"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "bitor"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "bool"))
cout << "Type \t" << str << endl;
else if (!strcmp(str.c_str(), "break"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "case"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "catch"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "char"))
cout << "Type \t" << str << endl;
else if (!strcmp(str.c_str(), "class"))
cout << "Classtype\t" << str << endl;
else if (!strcmp(str.c_str(), "compl"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "const"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "const_cast"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "continue"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "default"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "delete"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "do"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "double"))
cout << "Type \t" << str << endl;
else if (!strcmp(str.c_str(), "dynamic_cast"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "else"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "enum"))
cout << "Grouptype\t" << str << endl;
else if (!strcmp(str.c_str(), "explicit"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "export"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "extern"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "false"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "float"))
cout << "Type \t" << str << endl;
else if (!strcmp(str.c_str(), "for"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "friend"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "goto"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "if"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "inline"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "int"))
cout << "Type \t" << str << endl;
else if (!strcmp(str.c_str(), "long"))
cout << "Type \t" << str << endl;
else if (!strcmp(str.c_str(), "mutable"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "namespace"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "not"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "not_eq"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "operator"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "or"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "or_eq"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "private"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "protected"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "public"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "register"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "reinterpret_cast"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "return"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "short"))
cout << "Type \t" << str << endl;
else if (!strcmp(str.c_str(), "signed"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "sizeof"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "static"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "static_cast"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "struct"))
cout << "Classtype\t" << str << endl;
else if (!strcmp(str.c_str(), "switch"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "template"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "this"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "throw"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "true"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "try"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "typedef"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "typeid"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "typename"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "union"))
cout << "Classtype\t" << str << endl;
else if (!strcmp(str.c_str(), "unsigned"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "using"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "virtual"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "void"))//
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "volitile"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "wchar_t"))
cout << "Type \t" << str << endl;
else if (!strcmp(str.c_str(), "while"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "xor"))
cout << "Reserved\t" << str << endl;
else if (!strcmp(str.c_str(), "xor_eq"))
cout << "Reserved\t" << str << endl;
// check for predefined words
else if (!strcmp(str.c_str(), "cin"))
cout << "Predefined\t" << str << endl;
else if (!strcmp(str.c_str(), "endl"))
cout << "Predefined\t" << str << endl;
else if (!strcmp(str.c_str(), "main"))
cout << "Predefined\t" << str << endl;
else if (!strcmp(str.c_str(), "cout"))
cout << "Predefined\t" << str << endl;
else if (!strcmp(str.c_str(), "NULL"))
cout << "Predefined\t" << str << endl;
else if (!strcmp(str.c_str(), "string"))
cout << "Predefined\t" << str << endl;
// check for META
else if (!strcmp(str.c_str(), "#define"))
cout << "META \t" << str << endl;
else if (!strcmp(str.c_str(), "#undef"))
cout << "META \t" << str << endl;
else if (!strcmp(str.c_str(), "#ifdef"))
cout << "META \t" << str << endl;
else if (!strcmp(str.c_str(), "#ifndef"))
cout << "META \t" << str << endl;
else if (!strcmp(str.c_str(), "#else"))
cout << "META \t" << str << endl;
else if (!strcmp(str.c_str(), "#endif"))
cout << "META \t" << str << endl;
else if (!strcmp(str.c_str(), "#if"))
cout << "META \t" << str << endl;
else if (!strcmp(str.c_str(), "#elif"))
cout << "META \t" << str << endl;
else if (!strcmp(str.c_str(), "#error"))
cout << "META \t" << str << endl;
else if (!strcmp(str.c_str(), "#line"))
cout << "META \t" << str << endl;
else if (!strcmp(str.c_str(), "#pragma"))
cout << "META \t" << str << endl;
else if (!strcmp(str.c_str(), "#include"))
cout << "META \t" << str << endl;
//check for numbers and floating points
else if(str.find_first_not_of("1234567890") == string::npos)
cout << "CONST_NUM\t" << str << endl;
else if(str.find_first_not_of(" 1234567890.", 0) == string::npos)
cout << "CONST_FLOAT\t" << str << endl;
// check for string
else if (str[0] == '"' && str[str.length()-1]=='"')
cout << "CONST_STR\t" << str << endl;
// check for line comments
else if (str[0] == '/' && str[1]=='/')
cout << "LINECOMMENT\t" << str << endl;
// check for block comments
else if (str[0] == '/' && str[1] == '*' && str[str.length()-2]=='*' &&str[str.length()-1]=='/'){
cout << "BLOCKCOMMENT\t" << str; cout << endl;}
// all checks complete, assuming whatever passes all checkes is a
// valid identifier(not in real life, identifiers can't begin with numeric characters)
// we are assuming that the input is a valid c++ file, so this shouldn't be an issue.
else cout << "IDENTIFIER\t" << str << endl;
}