Lexical analysis is step 1 of compiling code down to machine language. The process breaks source code down into a long list of pieces called tokens. This list of tokens is used by a parser algorithm that extracts meaning from the order and arrangement of the tokens. Here is a small example of lex analysis:
code:
int main(void) {
float myvar = 2.5;
return 0;
}
list of tokens:
- int type
- main identifier
- (
- void keyword
- )
- {
- float keyword
- myvar identifier
- = operator
- 2.5 floating point constant
- ; end statement
- return keyword
- 0 integer constant
- ; end statement
- }
As you can see, the list of tokens gets long rather quickly. Absolutely no syntax checking is done during lex. That happens later down the line.
I have written a basic lexical analyzer to break C++ code into tokens. Its own source code is C++ as well.
// B.K. Turley // 1/26/2011 // Compile with g++ only, The nesting is limited to 128 in VS2010. // This limit is exceeded by the massive if..else..if..else statement // in identify_token() // Known bugs: // block comments are wrongly terminated with / instead of */ // Tokens are also stored in a linked list for later processing #include <iostream> // Include <iostream> whenever using C++ I/O (cin and cout) #include <iomanip> // Include <iomanip> along with iostream and you can't go wrong #include <fstream> // Include <fstream> whenever working with external files. #include <string> // Include <string> whenever using variable of type string. #include <sstream> #include <list> using namespace std; int find_split_point(string word); //accepts a string, returns a integer representing the length of the first token in the string void identify_token(string str); //accepts a string, pritnt the type of token (operator, reverved, meta ect.) followed by str. list<string> tokens; int main() { string filename = ""; fstream in; while(!in.is_open()){ cout << "Enter a valid C++ file file to tokenize "; cin >> filename; in.open(filename.c_str()); } std::stringstream buffer; buffer << in.rdbuf(); string lexme(buffer.str()); //begin lex while(lexme != ""){ int splitpoint = find_split_point(lexme);// if splitpoint is set to anything but zero here, string begins with operator or META if(splitpoint==0)// string doesn't begin with op or meta. 
so set splitpoint to the index to the first occurence of space, operator, or META splitpoint = lexme.find_first_of(" #"/=+-*%!^&|<>~,?.:;()[]{}t"); if(splitpoint==lexme.length()){ // this only executes whenever the very last piece of the string contains no ops or META identify_token(lexme); lexme = ""; } else{ if(splitpoint==0) splitpoint++; string token = lexme.substr(0,splitpoint); identify_token(token); lexme = lexme.substr(splitpoint, lexme.length()-splitpoint); } } return 0; } int find_split_point(string word){ // tests if first character is an operator(or META) and returns its length. /* examples: isfirstcharoperator("hello there") returns zero. isfirstcharoperator("++i;") returns 2. isfirstcharoperator("!!crazy!") returns 1. (there is no !! operator) isfirstcharoperator("#include <string>") returns 8. (#inclued is a meta word) */ string::iterator itr=word.begin(); if(itr != word.end() && *itr=='#'){ int i = 0; while(*itr != ' '){ i++; itr++; } return i; } else if(itr != word.end() && *itr=='"'){ int i = 2; itr++; while(*itr != '"'){ if(*itr=='\'){ i++; itr++; } i++; itr++; } return i; } else if(itr != word.end() && *itr=='/'){ itr++; if(itr != word.end() && *itr=='/'){ int i = 1; while(itr != word.end() && *itr != 'n'){ i++; itr++; } return i; }else if(itr != word.end() && *itr=='=') return 2; else if(itr != word.end() && *itr=='*'){ int i = 3; itr++; do { i++; itr++; }while(*itr != '/');//bug return i; } } else if(itr != word.end() && *itr=='+'){ itr++; if(itr != word.end() && *itr=='+') return 2; else if(itr != word.end() && *itr=='=') return 2; else return 1; } else if(itr != word.end() && *itr=='-'){ itr++; if(itr != word.end() && *itr=='-') return 2; else if(itr != word.end() && *itr=='=') return 2; else if(itr != word.end() && *itr=='>') return 2; else return 1; } else if(itr != word.end() && *itr=='*'){ itr++; if(itr != word.end() && *itr=='=') return 2; else if(itr != word.end() && *itr=='/') return 2; else return 1; } else if(itr != word.end() && 
*itr=='%'){ itr++; if(itr != word.end() && *itr=='=') return 2; else return 1; } else if(itr != word.end() && *itr=='!'){ itr++; if(itr != word.end() && *itr=='=') return 2; else return 1 ; } else if(itr != word.end() && *itr=='^'){ itr++; if(itr != word.end() && *itr=='=') return 2; else{ return 1; } } else if(itr != word.end() && *itr=='&'){ itr++; if(itr != word.end() && *itr=='=') return 2; else if(itr != word.end() && *itr=='&') return 2; else return 1; } else if(itr != word.end() && *itr=='|'){ itr++; if(itr != word.end() && *itr=='|') return 2; else if(itr != word.end() && *itr=='=') return 2; else return 1; } else if(itr != word.end() && *itr=='<'){ // bug, itr++; if(itr != word.end() && *itr=='<'){ itr++; if(itr != word.end() && *itr=='=') return 3; else return 2; } else if(itr != word.end() && *itr=='=') return 2; else return 1; } else if(itr != word.end() && *itr=='>'){ // bug, itr++; if(itr != word.end() && *itr=='>'){ itr++; if(itr != word.end() && *itr=='=') return 3; else return 2; } else if(itr != word.end() && *itr=='=') return 2; else return 1; } else if(itr != word.end() && *itr==':'){ itr++; if(itr != word.end() && *itr==':') return 2; return 1; } else if(itr != word.end() && *itr=='='){ itr++; if(itr != word.end() && *itr=='=') return 2; return 1; } else if( *itr==''' || *itr==',' || *itr=='~' || *itr=='?' || *itr=='.' 
|| *itr==';' || *itr==':' || *itr=='(' || *itr==')' || *itr=='{' || *itr=='}' || *itr=='[' || *itr==']' || *itr==' ' || *itr=='r'|| *itr=='n' ) return 1; // not an operator return 0; } void identify_token(string str){ // purge whitespace if ((!strcmp(str.c_str(), " "))) return; else if (!strcmp(str.c_str(), "t")) return; else if (!strcmp(str.c_str(), "r")) return; else if (!strcmp(str.c_str(), "n")) return; //check for operators else if (!strcmp(str.c_str(), "?")) cout << "questionmarkt" << str << endl; else if (!strcmp(str.c_str(), "'")) cout << "singlequotet" << str << endl; else if (!strcmp(str.c_str(), ";")) cout << "semicolont" << str << endl; else if (!strcmp(str.c_str(), "::")) cout << "scoperest" << str << endl; else if (!strcmp(str.c_str(), ":")) cout << "colon t" << str << endl; else if (!strcmp(str.c_str(), "{")) cout << "leftcurleyt" << str << endl; else if (!strcmp(str.c_str(), "}")) cout << "rightcurlyt" << str << endl; else if (!strcmp(str.c_str(), "[")) cout << "arraysubLt" << str << endl; else if (!strcmp(str.c_str(), "]")) cout << "arraysubRt" << str << endl; else if (!strcmp(str.c_str(), ".*")) cout << "pointtomembert" << str << endl; else if (!strcmp(str.c_str(), ".")) cout << "dotoperatort" << str << endl; else if (!strcmp(str.c_str(), "->*")) cout << "pointtomtmbert" << str << endl; else if (!strcmp(str.c_str(), "->")) cout << "arrowt" << str << endl; else if (!strcmp(str.c_str(), "(")) cout << "leftparent" << str << endl; else if (!strcmp(str.c_str(), ")")) cout << "rightparent" << str << endl; else if (!strcmp(str.c_str(), "++")) cout << "incrementt" << str << endl; else if (!strcmp(str.c_str(), "--")) cout << "decrementt" << str << endl; else if (!strcmp(str.c_str(), "typid")) cout << "type infot" << str << endl; else if (!strcmp(str.c_str(), "*_cast")) cout << "C++ castt" << str << endl; else if (!strcmp(str.c_str(), "sizeof")) cout << "size infot" << str << endl; else if (!strcmp(str.c_str(), "~")) cout << "bitwise NOTt" << str << endl; 
else if (!strcmp(str.c_str(), "!=")) cout << "not equalt" << str << endl; else if (!strcmp(str.c_str(), "!")) cout << "NOT t" << str << endl; else if (!strcmp(str.c_str(), "-=")) cout << "sub&assignt" << str << endl; else if (!strcmp(str.c_str(), "-")) cout << "minust" << str << endl; else if (!strcmp(str.c_str(), "+=")) cout << "add&assignt" << str << endl; else if (!strcmp(str.c_str(), "+")) cout << "add/concatt" << str << endl; else if (!strcmp(str.c_str(), "&&")) cout << "logicANDt" << str << endl; else if (!strcmp(str.c_str(), "&=")) cout << "AND&assignt" << str << endl; else if (!strcmp(str.c_str(), "&")) cout << "address oft" << str << endl; else if (!strcmp(str.c_str(), "*=")) cout << "mult&assignt" << str << endl; else if (!strcmp(str.c_str(), "*/")) cout << "closecommentt" << str << endl; else if (!strcmp(str.c_str(), "*")) cout << "mult/dereft" << str << endl; else if (!strcmp(str.c_str(), "new")) cout << "allocatet" << str << endl; else if (!strcmp(str.c_str(), "delete")) cout << "deallocatet" << str << endl; else if (!strcmp(str.c_str(), "/*")) cout << "opencommentt" << str << endl; else if (!strcmp(str.c_str(), "//")) // ? 
cout << "linecommentt" << str << endl; else if (!strcmp(str.c_str(), "/")) cout << "divide opt" << str << endl; else if (!strcmp(str.c_str(), "%=")) cout << "mod&assignt" << str << endl; else if (!strcmp(str.c_str(), "%")) cout << "modulot" << str << endl; else if (!strcmp(str.c_str(), "<<=")) cout << "shftL&assignt" << str << endl; else if (!strcmp(str.c_str(), "<<")) cout << "shiftleftt" << str << endl; else if (!strcmp(str.c_str(), "<=")) cout << "less or eqt" << str << endl; else if (!strcmp(str.c_str(), "<")) cout << "less thant" << str << endl; else if (!strcmp(str.c_str(), ">>=")) cout << "shftR&assignt" << str << endl; else if (!strcmp(str.c_str(), ">>")) cout << "shiftrightt" << str << endl; else if (!strcmp(str.c_str(), ">=")) cout << "greateroreqt" << str << endl; else if (!strcmp(str.c_str(), ">")) cout << "greater thant" << str << endl; else if (!strcmp(str.c_str(), "==")) cout << "equal tot" << str << endl; else if (!strcmp(str.c_str(), "=")) cout << "assignmentt" << str << endl; else if (!strcmp(str.c_str(), "^=")) cout << "XOR&assignt" << str << endl; else if (!strcmp(str.c_str(), "^")) cout << "bitwiseXORt" << str << endl; else if (!strcmp(str.c_str(), "||")) cout << "logic ORt" << str << endl; else if (!strcmp(str.c_str(), "|=")) cout << "OR&assignt" << str << endl; else if (!strcmp(str.c_str(), "throw")) cout << "throwext" << str << endl; else if (!strcmp(str.c_str(), ",")) cout << "sequencet" << str << endl; // check for reserved words else if (!strcmp(str.c_str(), "and")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "and_eq")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "asm")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "auto")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "bitand")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "bitor")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "bool")) cout << "Type t" << str << 
endl; else if (!strcmp(str.c_str(), "break")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "case")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "catch")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "char")) cout << "Type t" << str << endl; else if (!strcmp(str.c_str(), "class")) cout << "Classtypet" << str << endl; else if (!strcmp(str.c_str(), "compl")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "const")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "const_cast")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "continue")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "default")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "delete")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "do")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "double")) cout << "Type t" << str << endl; else if (!strcmp(str.c_str(), "dynamic_cast")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "else")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "enum")) cout << "Grouptypet" << str << endl; else if (!strcmp(str.c_str(), "explicit")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "export")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "extern")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "false")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "float")) cout << "Type t" << str << endl; else if (!strcmp(str.c_str(), "for")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "friend")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "goto")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "if")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "inline")) cout << "Reservedt" << str << endl; else if 
(!strcmp(str.c_str(), "int")) cout << "Type t" << str << endl; else if (!strcmp(str.c_str(), "long")) cout << "Type t" << str << endl; else if (!strcmp(str.c_str(), "mutable")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "namespace")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "not")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "not_eq")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "operator")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "or")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "or_eq")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "private")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "protected")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "public")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "register")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "reinterpret_cast")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "return")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "short")) cout << "Type t" << str << endl; else if (!strcmp(str.c_str(), "signed")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "sizeof")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "static")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "static_cast")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "struct")) cout << "Classtypet" << str << endl; else if (!strcmp(str.c_str(), "switch")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "template")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "this")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "throw")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "true")) cout << "Reservedt" << str << endl; 
else if (!strcmp(str.c_str(), "try")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "typedef")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "typeid")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "typename")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "union")) cout << "Classtypet" << str << endl; else if (!strcmp(str.c_str(), "unsigned")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "using")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "virtual")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "void"))// cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "volitile")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "wchar_t")) cout << "Type t" << str << endl; else if (!strcmp(str.c_str(), "while")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "xor")) cout << "Reservedt" << str << endl; else if (!strcmp(str.c_str(), "xor_eq")) cout << "Reservedt" << str << endl; // check for predefined words else if (!strcmp(str.c_str(), "cin")) cout << "Predefinedt" << str << endl; else if (!strcmp(str.c_str(), "endl")) cout << "Predefinedt" << str << endl; else if (!strcmp(str.c_str(), "main")) cout << "Predefinedt" << str << endl; else if (!strcmp(str.c_str(), "cout")) cout << "Predefinedt" << str << endl; else if (!strcmp(str.c_str(), "NULL")) cout << "Predefinedt" << str << endl; else if (!strcmp(str.c_str(), "string")) cout << "Predefinedt" << str << endl; // check for META else if (!strcmp(str.c_str(), "#define")) cout << "META t" << str << endl; else if (!strcmp(str.c_str(), "#undef")) cout << "META t" << str << endl; else if (!strcmp(str.c_str(), "#ifdef")) cout << "META t" << str << endl; else if (!strcmp(str.c_str(), "#ifndef")) cout << "META t" << str << endl; else if (!strcmp(str.c_str(), "#else")) cout << "META t" << str << endl; else if (!strcmp(str.c_str(), "#endif")) 
cout << "META t" << str << endl; else if (!strcmp(str.c_str(), "#if")) cout << "META t" << str << endl; else if (!strcmp(str.c_str(), "#elif")) cout << "META t" << str << endl; else if (!strcmp(str.c_str(), "#error")) cout << "META t" << str << endl; else if (!strcmp(str.c_str(), "#line")) cout << "META t" << str << endl; else if (!strcmp(str.c_str(), "#pragma")) cout << "META t" << str << endl; else if (!strcmp(str.c_str(), "#include")) cout << "META t" << str << endl; //check for numbers and floating points else if(str.find_first_not_of("1234567890") == string::npos) cout << "CONST_NUMt" << str << endl; else if(str.find_first_not_of(" 1234567890.", 0) == string::npos) cout << "CONST_FLOATt" << str << endl; // check for string else if (str[0] == '"' && str[str.length()-1]=='"') cout << "CONST_STRt" << str << endl; // check for line comments else if (str[0] == '/' && str[1]=='/') cout << "LINECOMMENTt" << str << endl; // check for block comments else if (str[0] == '/' && str[1] == '*' && str[str.length()-2]=='*' &&str[str.length()-1]=='/'){ cout << "BLOCKCOMMENTt" << str; cout << endl;} // all checks complete, assuming whatever passes all checkes is a // valid identifier(not in real life, identifiers can't begin with numeric characters) // we are assuming that the input is a valid c++ file, so this shouldn't be an issue. else cout << "IDENTIFIERt" << str << endl; }