Regex tokenization: def tokenize


A small lexer built from a single regular expression: each token type gets a named group (?P<NAME>...), the alternatives are joined with '|', and re.match is applied repeatedly, with mo.lastgroup reporting which token type fired.

import collections
import re

# This snippet comes from http://byrx.net

Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])

def tokenize(s):
    keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
    token_specification = [
        ('NUMBER',  r'\d+(\.\d*)?'),  # Integer or decimal number
        ('ASSIGN',  r':='),           # Assignment operator
        ('END',     r';'),            # Statement terminator
        ('ID',      r'[A-Za-z]+'),    # Identifiers
        ('OP',      r'[+*/\-]'),      # Arithmetic operators
        ('NEWLINE', r'\n'),           # Line endings
        ('SKIP',    r'[ \t]'),        # Skip over spaces and tabs
    ]
    tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
    get_token = re.compile(tok_regex).match
    line = 1
    pos = line_start = 0
    mo = get_token(s)
    while mo is not None:
        typ = mo.lastgroup
        if typ == 'NEWLINE':
            line_start = pos
            line += 1
        elif typ != 'SKIP':
            val = mo.group(typ)
            # Reserved words are lexed as IDs first, then promoted
            if typ == 'ID' and val in keywords:
                typ = val
            yield Token(typ, val, line, mo.start() - line_start)
        pos = mo.end()
        mo = get_token(s, pos)
    # match() returned None before the end of input: a lexical error
    if pos != len(s):
        raise RuntimeError('Unexpected character %r on line %d' % (s[pos], line))

statements = ''':=
    IF quantity THEN
        total := total + price * quantity;
        tax := price * 0.05;
    ENDIF;
'''

for token in tokenize(statements):
    print(token)
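Traced by hand (not captured output), the loop should print one Token per lexeme, beginning roughly like this:

Token(typ='ASSIGN', value=':=', line=1, column=0)
Token(typ='IF', value='IF', line=2, column=5)
Token(typ='ID', value='quantity', line=2, column=8)
Token(typ='THEN', value='THEN', line=2, column=17)
Token(typ='ID', value='total', line=3, column=9)

Note one column quirk: line_start is set to the position of the newline character itself, so columns on every line after the first are offset by one relative to the zero-based columns of line 1.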

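The dispatch above hinges on the match object's named-group introspection (lastgroup and friends). A minimal standalone sketch, using a hypothetical three-token pattern, of what those attributes expose:

import re

tok_regex = r'(?P<NUMBER>\d+(\.\d*)?)|(?P<ASSIGN>:=)|(?P<ID>[A-Za-z]+)'
mo = re.match(tok_regex, 'price')
print(mo.end())        # 5 -- index just past the matched text
print(mo.lastgroup)    # 'ID' -- name of the alternative that matched
print(mo.lastindex)    # 4 -- ID is the 4th capture group; NUMBER's inner
                       #      (\.\d*) group counts as the 2nd
print(mo.groupdict())  # {'NUMBER': None, 'ASSIGN': None, 'ID': 'price'}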