Support Forums

Full Version: Custom Parser Help Needed!
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
Hi, I'm trying to build a special parser in Python.
All this parser does is take source code in a scripting language of my invention and parse it into another scripting language, called JASS.

This parser has to be able to parse only between two specific keywords, so it can remain compatible with straight JASS, and other scripting languages.
It also needs to be able to search for certain blocks inside of the source script and, if it finds them, merge them with the very first definition of this specific block.
And finally, I need to be able to move several function/method declarations around based on the current namespace they're in, and whether functions in that namespace call functions in another (so if the namespace A calls a function in the namespace B, A goes before B to make sure the function can be called).

Here's a small example of what the two languages look like:

Code:
// before
library TestCode
    integer myInt = 5

    function myFunc()
        BJDebugMsg("Hello World");

// after

globals
    integer myInt
endglobals

function myFunc takes nothing returns nothing
    call BJDebugMsg("Hello World")
endfunction

That's just the basics. This has a lot more features (preprocessing commands, OOP-implementation, etc)
Does anyone have any kind of information, or links to (well documented) tools that can help me?
Hmmm I do remember seeing something in my book about creating a custom parser, but I'm not quite there yet. If you are familiar with python you could take a look at these examples from my book.
parser1.py:
Code:
# the parser (syntax analyser, evaluates during parse)

########################################################

    

class UndefinedError(Exception):
    """Raised when an expression reads a variable that has no value yet."""

from scanner import Scanner, LexicalError, SyntaxError

    

class Parser:
    """Recursive-descent parser that evaluates a sentence while parsing it.

    Tokens come from an embedded ``Scanner`` (``self.lex``) and variables
    live in the ``self.vars`` dictionary, which starts out holding ``pi``.
    The value of an expression and any error report are printed rather
    than returned.

    The print calls, ``except ... as`` clauses and ``raise`` calls are
    spelled so the class runs unchanged on Python 2.6+ and Python 3.
    """

    def __init__(self, text=''):
        self.lex = Scanner(text)               # embed a scanner
        self.vars = {'pi': 3.14159}            # predefined variable

    def parse(self, *text):
        """Parse and evaluate one sentence, printing its value or an error.

        An optional first positional argument replaces the scanner's text,
        so one parser (and its variables) can be reused across sentences.
        """
        if text:
            self.lex.newtext(text[0])
        try:
            self.lex.scan()                    # prime the first token
            self.Goal()
        except SyntaxError:
            print('Syntax Error at column: %s' % (self.lex.start,))
            self.lex.showerror()
        except LexicalError:
            print('Lexical Error at column: %s' % (self.lex.start,))
            self.lex.showerror()
        except UndefinedError as name:
            print("'%s' is undefined at column: %s" % (name, self.lex.start))
            self.lex.showerror()

    def Goal(self):
        """Parse a whole sentence: an expression (printed) or a 'set' command."""
        token = self.lex.token
        if token in ('num', 'var', '('):
            val = self.Expr()
            self.lex.match('\0')               # must be at end of input
            print(val)
        elif token == 'set':
            self.Assign()
            self.lex.match('\0')
        else:
            raise SyntaxError

    def Assign(self):
        """Parse ``set <var> <expr>`` and store the value under the name."""
        self.lex.match('set')
        var = self.lex.match('var')            # match() returns the name
        val = self.Expr()
        self.vars[var] = val

    def Expr(self):
        """Parse and evaluate a left-associative chain of '+' and '-'."""
        left = self.Factor()
        while True:
            token = self.lex.token
            if token in ('\0', ')'):
                return left
            if token == '+':
                self.lex.scan()
                left = left + self.Factor()
            elif token == '-':
                self.lex.scan()
                left = left - self.Factor()
            else:
                raise SyntaxError

    def Factor(self):
        """Parse and evaluate a left-associative chain of '*' and '/'."""
        left = self.Term()
        while True:
            token = self.lex.token
            if token in ('+', '-', '\0', ')'):
                return left
            if token == '*':
                self.lex.scan()
                left = left * self.Term()
            elif token == '/':
                self.lex.scan()
                left = left / self.Term()
            else:
                raise SyntaxError

    def Term(self):
        """Parse a number, a variable reference, or a parenthesised expression.

        Raises UndefinedError when a variable is not in ``self.vars``.
        """
        token = self.lex.token
        if token == 'num':
            return self.lex.match('num')
        if token == 'var':
            name = self.lex.value
            if name not in self.vars:
                raise UndefinedError(name)
            val = self.vars[name]              # look up before moving on
            self.lex.scan()
            return val
        if token == '(':
            self.lex.scan()
            val = self.Expr()                  # sub-expression
            self.lex.match(')')
            return val
        raise SyntaxError

                

# When run as a script, pass this module's Parser to the shared test driver.
if __name__ == '__main__':

    import testparser                       # self-test code

    testparser.test(Parser, 'parser1')      # test local Parser
parser2.py
Code:
# Initial value of Parser.traceme; when true, Parser.parse() prints the
# parse tree before checking and evaluating it.
TraceDefault = False

class UndefinedError(Exception):
    """Raised with (name, column) when a variable node has no value to read."""

from scanner import Scanner, SyntaxError, LexicalError

    

    

####################################################

# the interpreter (a smart objects tree)

####################################################

    

class TreeNode:
    """Base class for parse-tree nodes, supplying do-nothing defaults.

    Subclasses override whichever of the three operations they need.  The
    ``dict`` argument is the variable table that is handed down the tree.
    trace() uses a print call that is valid on Python 2 and Python 3.
    """

    def validate(self, dict):
        """Default error check: nothing to verify."""

    def apply(self, dict):
        """Default evaluator: produces ``None``."""

    def trace(self, level):
        """Default unparser: print a placeholder indented by ``level`` dots."""
        print('.' * level + '<empty>')



# ROOTS

    

class BinaryNode(TreeNode):
    """Common base for operator nodes holding a left and a right subtree.

    Concrete subclasses provide a ``label`` attribute (shown by trace())
    and their own apply().  trace() uses a print call that is valid on
    Python 2 and Python 3.
    """

    def __init__(self, left, right):
        self.left, self.right = left, right     # operand subtrees

    def validate(self, dict):
        """Validate both operand subtrees, left first."""
        self.left.validate(dict)
        self.right.validate(dict)

    def trace(self, level):
        """Print the operator label, then both subtrees indented 3 deeper."""
        print('.' * level + '[' + self.label + ']')
        self.left.trace(level + 3)
        self.right.trace(level + 3)

    

class TimesNode(BinaryNode):
    """Multiplication node: the product of the two operand subtrees."""

    label = '*'

    def apply(self, dict):
        lhs = self.left.apply(dict)
        rhs = self.right.apply(dict)
        return lhs * rhs

    

class DivideNode(BinaryNode):
    """Division node: the left operand divided by the right with ``/``."""

    label = '/'

    def apply(self, dict):
        numerator = self.left.apply(dict)
        denominator = self.right.apply(dict)
        return numerator / denominator

    

class PlusNode(BinaryNode):
    """Addition node: the sum of the two operand subtrees."""

    label = '+'

    def apply(self, dict):
        lhs = self.left.apply(dict)
        rhs = self.right.apply(dict)
        return lhs + rhs

    

class MinusNode(BinaryNode):
    """Subtraction node: the left operand minus the right operand."""

    label = '-'

    def apply(self, dict):
        lhs = self.left.apply(dict)
        rhs = self.right.apply(dict)
        return lhs - rhs

    

# LEAVES

    

class NumNode(TreeNode):
    """Leaf node holding a numeric literal.

    validate() is inherited unchanged.  trace() uses a print call that is
    valid on Python 2 and Python 3.
    """

    def __init__(self, num):
        self.num = num                 # already numeric

    def apply(self, dict):
        """Return the stored number; the variable table is not consulted."""
        return self.num

    def trace(self, level):
        """Print the number's repr() indented by ``level`` dots."""
        print('.' * level + repr(self.num))

    

class VarNode(TreeNode):
    """Leaf node naming a variable, remembering its column for error reports.

    Uses ``in``, a call-style ``raise`` and a print call so the class runs
    on Python 2.6+ and Python 3.
    """

    def __init__(self, text, start):
        self.name = text                      # variable name
        self.column = start                   # column for errors

    def validate(self, dict):
        """Raise UndefinedError(name, column) if the name is not in ``dict``."""
        if self.name not in dict:
            raise UndefinedError(self.name, self.column)

    def apply(self, dict):
        """Return the variable's value; run validate() first."""
        return dict[self.name]

    def assign(self, value, dict):
        """Store ``value`` under this node's name (used by AssignNode)."""
        dict[self.name] = value

    def trace(self, level):
        """Print the variable name indented by ``level`` dots."""
        print('.' * level + self.name)

    

# COMPOSITES

    

class AssignNode(TreeNode):
    """Composite node for ``set <var> <expr>``.

    ``var`` is the target node and ``val`` the expression subtree.
    trace() uses a print call that is valid on Python 2 and Python 3.
    """

    def __init__(self, var, val):
        self.var, self.val = var, val

    def validate(self, dict):
        """Validate only the value; the target need not exist yet."""
        self.val.validate(dict)

    def apply(self, dict):
        """Evaluate the value and assign it to the target; returns ``None``."""
        self.var.assign(self.val.apply(dict), dict)

    def trace(self, level):
        """Print 'set ', then the target and value indented 3 deeper."""
        print('.' * level + 'set ')
        self.var.trace(level + 3)
        self.val.trace(level + 3)



    

####################################################

# the parser (syntax analyser, tree builder)

####################################################

    

class Parser:
    """Recursive-descent parser that builds a tree of node objects.

    parse() runs three phases: analyse() builds the tree, errorCheck()
    validates variable names against ``self.vars``, and interpret() asks
    the tree to evaluate itself and prints any result.

    The print calls and ``except ... as`` clauses are spelled so the class
    runs unchanged on Python 2.6+ and Python 3.
    """

    def __init__(self, text=''):
        self.lex = Scanner(text)               # make a scanner
        self.vars = {'pi': 3.14159}            # add constants
        self.traceme = TraceDefault            # dump the tree in parse()?

    def parse(self, *text):
        """Parse, check and evaluate one sentence.

        An optional first positional argument replaces the scanner's text,
        so one parser (and its variables) can be reused across sentences.
        """
        if text:
            self.lex.newtext(text[0])
        tree = self.analyse()
        if not tree:                           # analyse() already reported
            return
        if self.traceme:
            print('')                          # blank line before the dump
            tree.trace(0)
        if self.errorCheck(tree):
            self.interpret(tree)

    def analyse(self):
        """Return the parse tree, or ``None`` after printing a parse error."""
        try:
            self.lex.scan()                    # prime the first token
            return self.Goal()
        except SyntaxError:
            print('Syntax Error at column: %s' % (self.lex.start,))
            self.lex.showerror()
        except LexicalError:
            print('Lexical Error at column: %s' % (self.lex.start,))
            self.lex.showerror()

    def errorCheck(self, tree):
        """Return 'ok' if every name is defined, else report and return ``None``."""
        try:
            tree.validate(self.vars)
            return 'ok'
        except UndefinedError as instance:
            varinfo = instance.args            # (name, column) tuple
            print("'%s' is undefined at column: %d" % varinfo)
            self.lex.start = varinfo[1]        # point the caret at the name
            self.lex.showerror()

    def interpret(self, tree):
        """Evaluate the tree and print the result unless it is ``None``."""
        result = tree.apply(self.vars)
        if result is not None:                 # 'set' yields no value
            print(result)

    def Goal(self):
        """Parse a whole sentence and return its tree."""
        token = self.lex.token
        if token in ('num', 'var', '('):
            tree = self.Expr()
        elif token == 'set':
            tree = self.Assign()
        else:
            raise SyntaxError
        self.lex.match('\0')                   # must be at end of input
        return tree

    def Assign(self):
        """Parse ``set <var> <expr>`` into an AssignNode."""
        self.lex.match('set')
        # Capture name and column before match() moves to the next token.
        vartree = VarNode(self.lex.value, self.lex.start)
        self.lex.match('var')
        valtree = self.Expr()
        return AssignNode(vartree, valtree)

    def Expr(self):
        """Parse a left-associative chain of '+' and '-' into nested nodes."""
        left = self.Factor()
        while True:
            token = self.lex.token
            if token in ('\0', ')'):
                return left
            if token == '+':
                self.lex.scan()
                left = PlusNode(left, self.Factor())
            elif token == '-':
                self.lex.scan()
                left = MinusNode(left, self.Factor())
            else:
                raise SyntaxError

    def Factor(self):
        """Parse a left-associative chain of '*' and '/' into nested nodes."""
        left = self.Term()
        while True:
            token = self.lex.token
            if token in ('+', '-', '\0', ')'):
                return left
            if token == '*':
                self.lex.scan()
                left = TimesNode(left, self.Term())
            elif token == '/':
                self.lex.scan()
                left = DivideNode(left, self.Term())
            else:
                raise SyntaxError

    def Term(self):
        """Parse a number, a variable reference, or a parenthesised expression."""
        token = self.lex.token
        if token == 'num':
            return NumNode(self.lex.match('num'))
        if token == 'var':
            leaf = VarNode(self.lex.value, self.lex.start)
            self.lex.scan()
            return leaf
        if token == '(':
            self.lex.scan()
            tree = self.Expr()
            self.lex.match(')')
            return tree
        raise SyntaxError

                

    

####################################################

# self-test code: use my parser, parser1's tester

####################################################

    

# When run as a script, pass this module's Parser to the test driver
# imported from testparser.
if __name__ == '__main__':

    import testparser

    testparser.test(Parser, 'parser2')    #  run with Parser class here
scanner.py:
Code:
####################################################

# the scanner (lexical analyser)

####################################################

    

import string

class SyntaxError(Exception):
    """Module-local parse error; shadows the builtin of the same name here."""

class LexicalError(Exception):
    """Module-local error for text that forms no token (once a string exception)."""

    

class Scanner:
    """Lexical analyser that splits expression text into tokens on demand.

    After each scan() the current token is described by three attributes:
    ``token`` is its kind (one of the characters ``( ) - + / *``, the NUL
    character for end of input, or ``'num'``, ``'var'``, ``'set'``);
    ``value`` is the number or variable name (``None`` for other tokens);
    ``start`` is the token's column in the text.

    Runs unchanged on Python 2.6+ and Python 3.  Whole numbers are
    converted with ``int`` (Python 2 promotes to ``long`` by itself when
    needed) and identifiers use ``string.ascii_letters``.
    """

    def __init__(self, text):
        self.next = 0                        # index where the next scan begins
        self.text = text + '\0'              # NUL sentinel marks end of input

    def newtext(self, text):
        """Restart the scanner on a new string."""
        Scanner.__init__(self, text)

    def showerror(self):
        """Print the text with a caret under the current token's column."""
        print('=>  ' + self.text)
        print('=>  ' + ' ' * self.start + '^')

    def match(self, token):
        """Require the current token to be ``token`` and step past it.

        Returns the matched token's value.  Raises SyntaxError([token])
        on a mismatch.  The end-of-input token is never stepped past.
        """
        if self.token != token:
            raise SyntaxError([token])
        value = self.value
        if self.token != '\0':
            self.scan()                      # advance to next token/value
        return value                         # value of the matched token

    def scan(self):
        """Read the next token into ``token``, ``value`` and ``start``.

        Raises LexicalError on a character that starts no token.  Scanning
        again at end of input keeps yielding the end token.
        """
        text = self.text
        self.value = None
        ix = self.next
        while text[ix] in string.whitespace:  # the sentinel stops this loop
            ix += 1
        self.start = ix
        char = text[ix]

        if char == '\0':
            # Stay on the sentinel so a further scan() cannot index past
            # the end of the text.
            self.token = char
        elif char in ('(', ')', '-', '+', '/', '*'):
            self.token = char
            ix += 1
        elif char in string.digits:
            begin = ix
            while text[ix] in string.digits:
                ix += 1
            if text[ix] == '.':              # optional fractional part
                ix += 1
                while text[ix] in string.digits:
                    ix += 1
                self.value = float(text[begin:ix])
            else:
                self.value = int(text[begin:ix])
            self.token = 'num'
        elif char in string.ascii_letters:
            begin = ix
            word_chars = string.digits + string.ascii_letters
            while text[ix] in word_chars:
                ix += 1
            word = text[begin:ix]
            if word.lower() == 'set':        # keyword, any letter case
                self.token = 'set'
            else:
                self.token = 'var'
                self.value = word
        else:
            raise LexicalError
        self.next = ix
testparser.py
Code:
####################################################

# parser test code

####################################################

    

def test(ParserClass, msg):
    """Run a fixed series of sample sentences through ``ParserClass``.

    ``ParserClass`` must accept an optional text argument and provide a
    ``parse(*text)`` method; ``msg`` is a label printed beside the class.
    Four parser instances are used so that variable assignments in one
    group cannot leak into another.  The print call is valid on Python 2
    and Python 3.
    """
    print('%s %s' % (msg, ParserClass))

    # Text supplied to the constructor, then new text on every later call.
    x = ParserClass('4 / 2 + 3')
    x.parse()
    for sentence in (
        '3 + 4 / 2',                        # like eval('3 + 4 / 2')...
        '(3 + 4) / 2',
        '4 / (2 + 3)',
        '4.0 / (2 + 3)',
        '4 / (2.0 + 3)',
        '4.0 / 2 * 3',
        '(4.0 / 2) * 3',
        '4.0 / (2 * 3)',
        '(((3))) + 1',
    ):
        x.parse(sentence)

    # Assignments and later use of the assigned names.
    y = ParserClass()
    for sentence in ('set a 4 / 2 + 1', 'a * 3', 'set b 12 / a', 'b'):
        y.parse(sentence)

    # A variable that is reassigned in terms of itself.
    z = ParserClass()
    for sentence in ('set a 99', 'set a a + 1', 'a'):
        z.parse(sentence)

    # The predefined constant and floating-point literals.
    z = ParserClass()
    for sentence in ('pi', '2 * pi', '1.234 + 2.1'):
        z.parse(sentence)


# This is a driver that needs arguments, not a test case; tell collectors
# that honour the __test__ attribute (pytest, nose) to leave it alone.
test.__test__ = False

    

def interact(ParserClass):                     # command-line entry
    """Read sentences from the console and parse each one until 'stop'.

    A single ``ParserClass`` instance is reused for the whole session, so
    variables set on one line are visible on later lines.  The loop also
    ends cleanly when input is exhausted (EOF) instead of raising.
    """
    try:
        read_line = raw_input                  # Python 2
    except NameError:
        read_line = input                      # Python 3

    print(ParserClass)
    x = ParserClass()
    while True:
        try:
            cmd = read_line('Enter=> ')
        except EOFError:                       # closed or exhausted stdin
            break
        if cmd == 'stop':
            break
        x.parse(cmd)
grammar.txt:
Code:
goal -> <expr> END                       [number, variable, ( ]

goal -> <assign> END                     [set]



assign -> 'set' <variable> <expr>        [set]

    

expr -> <factor> <expr-tail>             [number, variable, ( ]



expr-tail -> ^                           [END, ) ]

expr-tail -> '+' <factor> <expr-tail>    [+]

expr-tail -> '-' <factor> <expr-tail>    [-]

    

factor -> <term> <factor-tail>           [number, variable, ( ]

    

factor-tail -> ^                         [+, -, END, ) ]

factor-tail -> '*' <term> <factor-tail>  [*]

factor-tail -> '/' <term> <factor-tail>  [/]



term -> <number>                         [number]

term -> <variable>                       [variable]

term -> '(' <expr> ')'                   [(]

    

tokens: (, ), num, var, -, +, /, *, set, end
__init__.py:
Code:
#empty
Sorry if I'm not helping, I'm not quite that advanced yet.

Alright, thanks. I'll have a look at those now.