Python通过正则表达式选取callback,pythoncallback,最近在瞎想怎么通过xpath去精确抓取文章的正文


最近在瞎想怎么通过xpath去精确抓取文章的正文,跟parselets类似的想法,只不过更简单。

代码设计上采用正则表达式匹配URL,再选择callback handler的方式,主要参考web.py的分发器(Dispatcher)。

当然,这个实现比较老土,全部用function的方式回调,没有用类;可以参考web.py去做适当的修改。

#!/usr/bin/env python
"""Select a callback handler for a URL by regular-expression matching.

The dispatch scheme is modelled on the web.py dispatcher: a flat sequence of
``pattern, handler_name`` pairs is scanned in order, and the first pattern
that matches the whole URL decides which parser function is called. The
pattern's capture groups become the positional arguments of that function.

Usage: run the script with a single URL argument.
"""
import re
import sys
from itertools import islice


# Define the parsers first.
def baidu(username):
    """Return the Baidu parser's message for *username*."""
    # Business logic
    return "Using parser Baidu. and the user's name is: %s." % username


def qzone(uin):
    """Return the Qzone parser's message for the QQ number *uin*."""
    # Business logic
    return "Using parser Qzone, and the user's QQ is: %s." % uin


# From web.py
def group(seq, size):
    """Yield successive lists of at most *size* items taken from *seq*.

    The last list may be shorter when the input does not divide evenly.
    A non-positive *size* yields nothing.

        >>> list(group([1,2,3,4], 2))
        [[1, 2], [3, 4]]
        >>> list(group([1,2,3,4,5], 2))
        [[1, 2], [3, 4], [5]]
    """
    if size <= 0:
        return
    # iter() returns an iterator unchanged, so both plain iterables and
    # already-started iterators are consumed from their current position.
    iterator = iter(seq)
    while True:
        chunk = list(islice(iterator, size))
        if not chunk:
            break
        yield chunk


def parser_init(url, mapping):
    """Find the first handler in *mapping* whose pattern matches *url*.

    *mapping* is a flat sequence of alternating regular-expression patterns
    and handlers: ``(pattern1, handler1, pattern2, handler2, ...)``.

    Returns ``(handler, groups)`` where *groups* is the list of the pattern's
    capture groups, or ``(None, None)`` when nothing matches. A trailing
    pattern without a handler raises ``ValueError`` if the scan reaches it.
    """
    for pat, what in group(mapping, 2):
        # Wrap the pattern in a non-capturing group so that the anchors
        # apply to the whole pattern even when it contains a top-level "|".
        result = re.match('^(?:' + pat + ')$', url)
        if result:
            return what, list(result.groups())
    return None, None


# URL pattern -> name of the parser function defined in this module.
# Dots in host names are escaped so that they only match a literal ".".
MAPPING = (
    r'http://(?:hi|space)\.baidu\.com/([^/]+)(?:/.*)?', 'baidu',
    r'http://(\d+)\.qzone\.qq\.com(?:/.*)?', 'qzone',
)


def main(argv=None):
    """Dispatch the URL in ``argv[1]`` to its parser and print the result.

    *argv* defaults to ``sys.argv``. Returns the process exit status:
    0 after printing a parser result or "No parser found.", 1 when the
    matched handler is not callable, 2 when no URL argument was given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        prog = argv[0] if argv else 'parser'
        sys.stderr.write('Usage: %s URL\n' % prog)
        return 2

    func, args = parser_init(argv[1], MAPPING)
    if not func:
        print('No parser found.')
        return 0

    # A handler given by name is resolved against this module's globals;
    # anything else is used as-is.
    callback = globals().get(func, func)
    if not callable(callback):
        sys.stderr.write('Parser %r is not callable.\n' % (func,))
        return 1

    print(callback(*args))
    return 0


if __name__ == '__main__':
    sys.exit(main())

# This snippet comes from http://byrx.net

评论关闭