\documentclass[a4paper,11pt]{article} \usepackage[utf8]{inputenc} \usepackage{listings} \usepackage{hyperref} %\VignetteIndexEntry{Examples} \lstset{ breaklines, breakatwhitespace=false, showstringspaces=false, xleftmargin=11pt, xrightmargin=15pt, language=R} \title{ Parser combinator in R (\emph{rmoparser}) : Vignette} \SweaveOpts{keep.source=TRUE,echo=TRUE} \begin{document} \maketitle \tableofcontents %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Simple mathematical expressions} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Prefix notation} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Following \href{http://en.wikipedia.org/wiki/Operator-precedence_parser}{Operator Precedence Parser} entry in Wikipedia, we will construct a parser for simple mathematical expressions that turns infix notation into prefix notation. For example, given the expression \begin{center} 3 + 3 \end{center} we will obtain \begin{center} (+ (NUM 3) (NUM 3) ) \end{center} or given \begin{center} x * sin(y) \end{center} we will obtain \begin{center} (* (VAR x) (sin (VAR y)) ) \end{center} These prefix expressions can be easily evaluated to obtain their numerical value or to perform symbolic differentiation. The grammar we will implement can be defined as follows: \lstset{language=} \begin{lstlisting} list_expression = additive_expression, ws, ';', { additive_expression, ws, ';' } ; additive_expression = ws, multiplicative_expression, { ws, ( '+' | '-' ), ws, multiplicative_expression } ; multiplicative_expression = power_expression, { ws, ( '*' | '/' ), ws, power_expression } ; power_expression = primary, [ ws, '**', ws, primary ] ; primary = '(', ws, additive_expression, ws, ')' | '-', ws, primary | FUN, ws, '(', ws, additive_expression, ws, ')' | NUMBER | VARIABLE ; NUMBER = ? numberScientific ? ; VARIABLE = ? symbolic ? ; FUN = ? symbolic ? ; ws = ? whitespace ? ; \end{lstlisting} \lstset{language=R} First rule, list\_expression, implements recognition of a list of mathematical expressions delimited by semicolons ';'. The only action of this rule is printing the output. <>= list_expression <- function() concatenation( concatenation(additive_expression(),ws(),keyword(';'), action=function(s) print(exprToString(s[[1]]))), repetition0N( concatenation(additive_expression(),ws(),keyword(';'), action=function(s) print(exprToString(s[[1]])))), action=function(s) NULL) @ Second rule, additive\_expression, implements sums and subtractions recognition, which are lower precedence operations. It generates a list following the pattern (+ operator1 operator2). <>= library(qmrparser) additive_expression <- function() concatenation(ws(),multiplicative_expression(), option( concatenation(ws(), alternation( keyword('+',action=function(s) s), keyword('-',action=function(s) s), action=function(s) s), ws(),additive_expression(), action=function(s) list(type='noempty',value=s[c(2,4)]))), action=function(s) {if(s[[3]]$value$type=='empty') s[[2]] else list(fun=s[[3]]$value$value[[1]],par1=s[[2]],par2=s[[3]]$value$value[[2]]) }); @ Next rule, multiplicative\_expression, implements products and divisions recognition, which have higher precedence than sums and subtractions but lower than exponentiation or functions. It generates a list following the pattern (* operator1 operator2). <>= multiplicative_expression <- function() concatenation(power_expression(), option( concatenation(ws(), alternation( keyword('*',action=function(s) s), keyword('/',action=function(s) s), action=function(s) s), ws(),multiplicative_expression(), action=function(s) {list(type='noempty',value=s[c(2,4)])})), action=function(s) { if(s[[2]]$value$type=='empty') s[[1]] else list(fun=s[[2]]$value$value[[1]],par1=s[[1]],par2=s[[2]]$value$value[[2]])}); @ Rule power\_expression recognises exponentiation. It generates a list following the pattern (\^ operator1 operator2). <>= power_expression <- function() concatenation(primary(), option( concatenation(ws(),keyword('**'),ws(),power_expression(), action=function(s) list(type='noempty',value=s[[4]]))), action=function(s){if(s[[2]]$value$type=='empty') s[[1]] else list(fun="^",par1=s[[1]],par2=s[[2]]$value$value)}); @ Lastly, basic elements are defined: \begin{itemize} \item Expression grouping within parenthesis \item Unary minus \item Functions \item Numbers \item Variables \end{itemize} <>= primary <- function() alternation( concatenation(charParser('('),ws(),additive_expression(),ws(),charParser(')'),action=function(s) s[[3]]), concatenation(charParser('-'),ws(),primary(), action=function(s) list(fun="U-",par1=s[[3]])), concatenation(FUN(action=function(s) s), ws(), charParser('('), ws(), additive_expression(), ws(), charParser(')'), action=function(s) list(fun=s[[1]],par1=s[[5]])), NUMBER (action=function(s) list(fun="NUM", par1=s)), VARIABLE(action=function(s) list(fun="VAR", par1=s)), action=function(s) s); @ Auxiliary functions which implement recognition for numbers, variables, function names or blanks. <>= NUMBER <- function(...) numberScientific(...); VARIABLE <- function(...) symbolic(...); FUN <- function(...) symbolic(...); ws <- function() whitespace(); @ Moreover, in order to print in a clean way prefix notation: <>= exprToString <- function(expr) if ( !is.list(expr) ) as.character(expr) else paste("(",paste(sapply(expr,exprToString,USE.NAMES=FALSE),collapse=" "),")") @ And some examples: <>= print("Infix to Prefix Examples") invisible( list_expression()(streamParserFromString(" 8 ;")) ) invisible( list_expression()(streamParserFromString("8 +4;")) ) invisible( list_expression()(streamParserFromString("8/2 ;")) ) invisible( list_expression()(streamParserFromString("8*2 ;")) ) invisible( list_expression()(streamParserFromString("2*3 + 4*5;")) ) invisible( list_expression()(streamParserFromString("sqrt( 16) ;")) ) invisible( list_expression()(streamParserFromString("sin(3.1415) ;")) ) invisible( list_expression()(streamParserFromString("sin(3.14* (2*2+3+1)/2 ) ** 8;")) ) invisible( list_expression()(streamParserFromString("sqrt(16)**2+sin(3)-sin(3);")) ) invisible( list_expression()(streamParserFromString("sqrt(16)**2+sin(3)-sin(3)*2;") ) ) @ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Calculator} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% In order to get a calculator, we only have to substitute the function which prints in a clean way by another which performs calculations: <>= exprToNumber <- function(expr) switch(expr[[1]], 'NUM'= as.numeric(expr[[2]]), 'VAR' =as.numeric(get(expr[[2]])), 'U-'=-exprToNumber(expr[[2]]), do.call(expr[[1]],unname(lapply(expr[-1],exprToNumber))) ) @ and modify the list\_expression function <>= list_expression <- function() concatenation( concatenation(additive_expression(),ws(),keyword(';'), action=function(s) print(exprToString(exprToNumber(s[[1]])))), repetition0N( concatenation(additive_expression(),ws(),keyword(';'), action=function(s) print(exprToString(exprToNumber(s[[1]]))))), action=function(s) NULL); @ Some examples: <>= print("Calculator") invisible( list_expression()(streamParserFromString(" 8 ;")) ) invisible( list_expression()(streamParserFromString("8 +4;")) ) invisible( list_expression()(streamParserFromString("8/2 ;")) ) invisible( list_expression()(streamParserFromString("8*2 ;")) ) invisible( list_expression()(streamParserFromString("2*3 + 4*5;")) ) invisible( list_expression()(streamParserFromString("sqrt( 16) ;")) ) invisible( list_expression()(streamParserFromString("sin(3.1415) ;")) ) invisible( list_expression()(streamParserFromString("sin(3.14* (2*2+3+1)/2 ) ** 8;")) ) invisible( list_expression()(streamParserFromString("sqrt(16)**2+sin(3)-sin(3);")) ) invisible( list_expression()(streamParserFromString("sqrt(16)**2+sin(3)-sin(3)*2;")) ) @ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Symbolic differentiation} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% In order to get a symbolic derivatives calculator, we only need to substitute the function which prints in a clean way by another function which performs differentiation. Differentiation is performed in prefix notation and, as it is only an example, output is not printed in infix notation. Moreover, it is not implemented any set of functions neither simplification after differentiation. <>= exprDeriv <- function(expr,var) switch(expr[[1]], 'NUM'= list("NUM", "0"), 'VAR' = if( expr[[2]] == var ) list("NUM" ,"1") else list("NUM", "0"), "+"=,"-"= list(expr[[1]],exprDeriv(expr[[2]],var),exprDeriv(expr[[3]],var)), "*" =list("+", list("*",expr[[2]],exprDeriv(expr[[3]],var)), list("*",expr[[3]],exprDeriv(expr[[2]],var)) ), "/" =list("*", list("-", list("*",expr[[3]],exprDeriv(expr[[2]],var)), list("*",expr[[2]],exprDeriv(expr[[3]],var)) ), list("**",expr[[3]],"2") ), "sin"=list("*",exprDeriv(expr[[2]],var),list("cos", expr[[2]])), list(paste("Diff",var,sep="_"),expr) ) @ list\_expression function must be modified: <>= list_expression <- function() concatenation( concatenation(additive_expression(),ws(),keyword(';'), action=function(s) print(exprToString(exprDeriv(s[[1]],"x")))), repetition0N( concatenation(additive_expression(),ws(),keyword(';'), action=function(s) print(exprToString(exprDeriv(s[[1]],"x"))))), action=function(s) NULL); @ Some examples: <>= print("Differentiation") invisible( list_expression()(streamParserFromString(" 8 ;")) ) invisible( list_expression()(streamParserFromString(" x ;")) ) invisible( list_expression()(streamParserFromString("8 +x;")) ) invisible( list_expression()(streamParserFromString("x/2 ;")) ) invisible( list_expression()(streamParserFromString("8*x ;")) ) invisible( list_expression()(streamParserFromString("2*x + 4*x;")) ) invisible( list_expression()(streamParserFromString("1+sqrt( x) ;")) ) invisible( list_expression()(streamParserFromString("sin(x) ;")) ) invisible( list_expression()(streamParserFromString("sin(x* (2*2+x+1)/2 ) ** 8;")) ) @ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Wikipedia example} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% In \href{http://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_Form}{ Extended Backus–Naur Form} can be found the following grammar definition example: \lstset{language=Tex} \begin{lstlisting} (* a simple program syntax in EBNF - Wikipedia *) program = 'PROGRAM' , white space , identifier , white space , 'BEGIN' , white space , { assignment , ";" , white space } , 'END.' ; identifier = alphabetic character , { alphabetic character | digit } ; number = [ "-" ] , digit , { digit } ; string = '"' , { all characters - '"' } , '"' ; assignment = identifier , ":=" , ( number | identifier | string ) ; alphabetic character = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" ; digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ; white space = ? white space characters ? ; all characters = ? all visible characters ? ; \end{lstlisting} Grammar definition is done in a language which has its own grammar. The wikipedia article does not make a formal definition of this grammar, but it does enummerate which symbols are used: \begin{lstlisting} Table of symbols The following represents a proposed standard. Usage Notation definition = concatenation , termination ; alternation | option [ ... ] repetition { ... } grouping ( ... ) terminal string " ... " terminal string ' ... ' comment (* ... *) special sequence ? ... ? exception - \end{lstlisting} \lstset{language=R} The example developed next allows grammar recognition using these symbols and will recognise a simple program syntax in EBNF from Wikipedia example, while creating a parser for the defined grammar. It is just a practical example to illustrate package function usage and it is not intended to recognise or parse arbitrary free context grammars. For this last purpose, the following references may be useful: \begin{itemize} \item \href{http://en.wikipedia.org/wiki/List_of_algorithms#Parsing}{Parsing} \item \href{http://en.wikipedia.org/wiki/Earley%27s_algorithm}{Earley parser} \end{itemize} A grammar is a collection/repetition of rules, ebnfRule. Some of these rules may be delimited by blanks and they end where the file containing grammar definition ends. <>= gramatica <- function() concatenation( repetition1N( concatenation(ebnfRule(),whitespace(),action=function(s) s[[1]]), action=function(s) s), eofMark(error=function(p) errorFun(p,h=NULL,type="eofMark")), action=function(s) unlist(s[[1]]) ) @ A grammatical rule has a name, 'rule\_name', a definition and ends with a semi-colon ';' \begin{lstlisting} rule_name = definition ; \end{lstlisting} 'rule\_name' is a symbol, a sequences of letters, numbers and '\_' beginning with a letter. Therefore, rule definition is in parser notation: \begin{lstlisting} symbolic(),charParser("="),ebnfDefinition(),charParser(';') \end{lstlisting} And taking into account blanks, concatenation and sequentially processing: <>= ebnfRule <- function() concatenation( whitespace(), symbolic(charFirst=isLetter,charRest=function(ch) isLetter(ch) || isDigit(ch) || ch == "_",action=function(s) s), whitespace(),charParser("="), whitespace(),ebnfDefinition(),whitespace(),charParser(';'),whitespace(), action=function(s) paste(s[[2]]," <- function() ", s[[6]])) @ A rule definition may be made up of just one or different possible alternatives. <>= ebnfDefinition <- function() alternation( # several alternatives ebnfAlternation(), # No alternatives ebnfNonAlternation(), action=function(s) s) @ Alternatives making up a definition are delimited by ``|'' <>= ebnfAlternation <- function() concatenation( ebnfNonAlternation(), repetition1N( concatenation(whitespace(),charParser("|"),whitespace(),ebnfNonAlternation(),action=function(s) s[[4]]),action=function(s) s), action=function(s) paste("alternation(",paste(s[[1]],",",paste(unlist(s[[2]]), collapse=","), sep=""),")",sep="")) @ The ways of rule definition which are not a list of alternatives is: \begin{itemize} \item A concatenation of definitions <>= ebnfConcatenation <- function() option( concatenation( whitespace(),charParser(","),whitespace(), ebnfNonAlternation(), action=function(s) list(type="noempty",value=s[[4]]))) @ this concatenation definition is added up to the other possibilities on the right side to avoid a problem with left recursive grammatical rules. \SweaveOpts{eval=FALSE} \item String <>= # string concatenation( string(action=function(s) paste("keyword('",s,"')",sep="")), ebnfConcatenation(), action=function (s) if(s[[2]]$value$type=="empty") s[[1]] else paste("concatenation(",s[[1]],",",s[[2]]$value$value,")",sep="")) @ \item A special sequence recognising non-string terminal symbols. <>= # special sequence concatenation( ebnfSpecialSequence(), ebnfConcatenation(), action=function (s) if(s[[2]]$value$type=="empty") s[[1]] else paste("concatenation(",s[[1]],",",s[[2]]$value$value,")",sep="")) @ \item A symbol referencing a defined rule. <>= # rule call concatenation( symbolic(charFirst=isLetter,charRest=function(ch) isLetter(ch) || isDigit(ch) || ch == "_",action=function(s) paste(s,"()",sep="")), ebnfConcatenation(), action=function (s) if(s[[2]]$value$type=="empty") s[[1]] else paste("concatenation(",s[[1]],",",s[[2]]$value$value,")",sep="")) @ \item A parenthesised definition grouping other definitions <>= # grouping concatenation( whitespace(),charParser("("),whitespace(), ebnfDefinition(), whitespace(),charParser(")"), ebnfConcatenation(), action=function (s) if(s[[7]]$value$type=="empty") s[[4]] else paste("concatenation(",s[[4]],",",s[[7]]$value$value,")",sep="")) @ \item A definition repetition. <>= # repetition concatenation( whitespace(),charParser("{"),whitespace(), ebnfDefinition(), whitespace(),charParser("}"), ebnfConcatenation(), action=function (s) if(s[[7]]$value$type=="empty") paste("repetition0N(",s[[4]],")") else paste("concatenation(", paste("repetition0N(",s[[4]],")"),",", s[[7]]$value$value,")",sep="")) @ \item Optional application of a definition. <>= # option concatenation( whitespace(),charParser("["),whitespace(), ebnfDefinition(), whitespace(),charParser("]"), ebnfConcatenation(), action=function (s) if(s[[7]]$value$type=="empty") paste("option(",s[[4]],")") else paste("concatenation(", paste("option(",s[[4]],")"),",", s[[7]]$value$value,")",sep="")) @ \end{itemize} \SweaveOpts{eval=TRUE} Therefore, rule definition can be written as: <>= <> ebnfNonAlternation <- function() alternation( <> , <> , <> , <> , <> , <> , action=function(s) s) <> @ 'Special Sequence' definition will be associated with basic tokens. <>= ebnfSpecialSequence <- function() concatenation(whitespace(),charParser("?"),whitespace(), alternation( keyword("whitespace" ,action=function(s) s), keyword("symbolic" ,action=function(s) s), keyword("string" ,action=function(s) s), keyword("numberInteger" ,action=function(s) s), keyword("numberScientific",action=function(s) s), action=function(s) paste(s,"()",sep="")), whitespace(),charParser("?"), action=function(s) s[[4]]) @ Finally, a function is needed to inform where the parser failed to recognise any text. <>= errorFun <- function(strmPosition,h=NULL,type="") { if ( is.null(h) || type != "concatenation" ) print(paste("Error from line:",strmPosition$line, " Caracter:",strmPosition$linePos," Stream Pos:", strmPosition$streamPos, "Type:",type)) else errorFun(h$pos,h$h,h$type) return(list(type=type,pos=strmPosition,h=h)) } @ Some partial tests recognising simple rules: \begin{enumerate} \item <>= stream <- streamParserFromString('program = \'PROGRAM\' ;') cstream <- ebnfRule()(stream) print(cstream[c("status","node")]) @ \item <>= stream <- streamParserFromString('program = \'PROGRAM\' , white_space , identifier , white_space ;') cstream <- ebnfRule()(stream) print(cstream[c("status","node")]) @ \item <>= stream <- streamParserFromString( ' program = \'PROGRAM\' , white_space , identifier , white_space , \'BEGIN\' , white_space , { assignment , ";" , white_space } , \'END.\' ; ') cstream <- ebnfRule()(stream) print(cstream[c("status","node")]) @ \item <>= stream <- streamParserFromString( 'identifier = alphabetic_character , { alphabetic_character | digit } ;') cstream <- ebnfRule()(stream) print(cstream[c("status","node")]) @ \item <>= stream <- streamParserFromString('white_space = ? whitespace ? ;') cstream <- ebnfRule()(stream) print(cstream[c("status","node")]) @ \end{enumerate} Now, the code will parse the whole set of rules from the Wikipedia example. They have been slightly modified in order to simplify programming: <>= stream <- streamParserFromString( ' program = \'PROGRAM\' , white_space , identifier , white_space , \'BEGIN\' , white_space , { assignment , ";" , white_space } , \'END.\' ; identifier = alphabetic_character , { alphabetic_character | digit } ; number = [ "-" ] , digit , { digit } ; assignment = identifier , ":=" , ( number | identifier | string_ ) ; alphabetic_character = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" ; digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ; white_space = ? whitespace ? ; string_ = ? string ?; ') cstream <- gramatica()(stream) print(cstream[c("status")]) @ As a result, we obtain functions in R capable of parsing texts in this grammar. R functions parse/eval are used to apply this parser. %%if ( cstream[[c("status")]] == "ok") { <>= print(cstream[[c("node")]]) eval(parse(text=cstream[[c("node")]])) @ We test some functions created in the previous step: <>= identifier()(streamParserFromString("DEMO1"))$status identifier()(streamParserFromString("A0"))$status keyword(':=')(streamParserFromString(":="))$status number()(streamParserFromString("3"))$status @ And finally, we will check that our parser can parse the program in the Wikipedia example. <>= stream <- streamParserFromString( 'PROGRAM DEMO1 BEGIN A0:=3; B:=45; H:=-100023; C:=A; D123:=B34A; BABOON:=GIRAFFE; TEXT:="Hello world!"; END.') cstream <- program()(stream) if ( cstream$status=="fail" ) errorFun(cstream$node$pos,cstream$node$h,cstream$node$type) else print(cstream[c("status")]) @ \end{document}