# Crawl the Chinese Library Classification (CLC, "Zhongtu Fenleifa") category tree.


# -*- coding: utf-8 -*-
# Crawl a hierarchical book-classification web site and print one SQL
# INSERT statement for the ``booktypeinfo`` table per category found.
import contextlib
import re
import time

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: urllib2's API lives in urllib.request
    import urllib.request as urllib2


class Test:
    """Recursive crawler that prints SQL INSERTs for a category tree.

    Each page is expected to contain exactly one
    ``<ul id="list" class="cent" ...>`` element whose ``<li>`` entries hold
    a ``<span class="code">`` (the category code) and an ``<a href=...>``
    (the category name and the URL of its sub-category page).
    """

    # Last row id handed out; bumped once for every emitted category.
    index = 0

    # The patterns use [\w\W] to match across newlines, so no regex flags
    # are required.
    _LIST_RE = re.compile(
        r'<ul id="list" class="cent" style="list-style:none;">'
        r'([\w\W]*?)</ul>'
    )
    _ITEM_RE = re.compile(r'<li>([\w\W]*?)</li>')
    _CODE_RE = re.compile(r'<span class="code">([\w\W]*?)</span>')
    _HREF_RE = re.compile(r'<a href="([\w\W]*?)"')
    _TEXT_RE = re.compile(r'">([\w\W]*?)</a>')

    _INSERT_SQL = (
        u"insert into booktypeinfo(id, paterbooktype_id, code, name) "
        u"values (%s, %s, '%s', '%s');"
    )

    @staticmethod
    def _sql_quote(text):
        """Escape *text* for use inside a single-quoted SQL string literal."""
        return text.replace("'", "''")

    def GetBookType(self, url, pId):
        """Print INSERT statements for every category reachable from *url*.

        The page at *url* is downloaded and decoded as UTF-8.  For every
        well-formed list entry a new id is taken from ``self.index``, an
        INSERT statement is printed, and the entry's link is crawled
        recursively with the new id as parent.

        Args:
            url: Address of the page to fetch (anything ``urlopen`` accepts).
            pId: Row id of the parent category; ``0`` means "no parent" and
                is written as SQL ``null``.

        Returns:
            None.  Pages that do not contain exactly one category list are
            treated as leaves and produce no output.

        Raises:
            Whatever ``urlopen`` raises for an unreachable URL, and
            ``UnicodeDecodeError`` if the page is not valid UTF-8.
        """
        # closing() guarantees the connection is released on both Python 2
        # and 3, even if reading or decoding fails.
        with contextlib.closing(urllib2.urlopen(url)) as response:
            html = response.read().decode('UTF-8')

        lists = self._LIST_RE.findall(html)
        if len(lists) != 1:
            return

        for item in self._ITEM_RE.findall(lists[0]):
            codes = self._CODE_RE.findall(item)
            hrefs = self._HREF_RE.findall(item)
            # Drop the opening span tag first so that the first '">' in the
            # entry is the one that closes the <a ...> tag.
            names = self._TEXT_RE.findall(
                item.replace('<span class="code">', '')
            )
            if not (codes and hrefs and names):
                # Malformed entry: skip it instead of dying with IndexError.
                continue

            self.index += 1
            row_id = self.index
            parent_id = 'null' if pId == 0 else pId
            # A single parenthesised argument keeps this line valid as both
            # a Python 2 print statement and a Python 3 print() call.
            print(self._INSERT_SQL % (
                row_id,
                parent_id,
                self._sql_quote(codes[0]),
                self._sql_quote(names[0]),
            ))
            self.GetBookType(hrefs[0], row_id)


if __name__ == '__main__':
    # NOTE(review): the start URL looks like it was replaced by the site
    # hosting this snippet; point it at the real classification index page.
    t = Test()
    t.GetBookType('http://byrx.net', 0)

# This snippet comes from http://byrx.net

# (Source web page footer: "Comments closed.")