# 爬取中图分类法 -- crawl the Chinese Library Classification (CLC) scheme
# -*- coding: utf-8 -*-
"""Crawl a hierarchical book-classification site and print SQL inserts.

Each category page is expected to contain exactly one
``<ul id="list" class="cent" style="list-style:none;">`` element whose
``<li>`` items hold a ``<span class="code">`` (the category code) and an
``<a href="...">`` link (the category name, pointing at the child page).
One ``insert into booktypeinfo ...`` statement is printed per category.
"""
import re
import time
from contextlib import closing

try:  # Python 2
    import urllib2
    from urlparse import urljoin
except ImportError:  # Python 3
    import urllib.request as urllib2
    from urllib.parse import urljoin


class Test(object):
    """Depth-first crawler that prints one SQL insert per category found."""

    # Last id handed out.  The first increment creates an instance attribute,
    # so every Test() instance numbers its rows starting from 1.
    index = 0

    # Patterns are compiled once here instead of on every loop iteration.
    _FLAGS = re.MULTILINE | re.DOTALL
    _LIST_RE = re.compile(
        u'<ul id="list" class="cent" style="list-style:none;">([\\w\\W]*?)</ul>',
        _FLAGS)
    _ITEM_RE = re.compile(u'<li>([\\w\\W]*?)</li>', _FLAGS)
    _CODE_RE = re.compile(u'<span class="code">([\\w\\W]*?)</span>', _FLAGS)
    _HREF_RE = re.compile(u'<a href="([\\w\\W]*?)"', _FLAGS)
    _NAME_RE = re.compile(u'">([\\w\\W]*?)</a>', _FLAGS)

    _INSERT_SQL = (u"insert into booktypeinfo(id, paterbooktype_id, code, name)"
                   u" values (%s, %s, '%s', '%s');")

    def GetBookType(self, url, pId):
        """Print an insert for every category on ``url``, then recurse.

        Args:
            url: Address of the category page to fetch.
            pId: Id of the parent row; ``0`` means "no parent" and is
                written as SQL ``null``.

        Returns:
            None.  The statements are written to standard output.

        A page that does not contain exactly one matching ``<ul>`` ends the
        recursion for that branch.  List items missing the code span or the
        link are skipped and do not consume an id.
        """
        # Close the response deterministically instead of leaking the socket.
        with closing(urllib2.urlopen(url)) as res:
            html = res.read().decode('UTF-8')

        lists = self._LIST_RE.findall(html)
        if len(lists) != 1:
            return

        for item in self._ITEM_RE.findall(lists[0]):
            codes = self._CODE_RE.findall(item)
            hrefs = self._HREF_RE.findall(item)
            # The opening span tag itself contains '">', which would make the
            # name pattern match too early, so it is removed first.
            names = self._NAME_RE.findall(
                item.replace('<span class="code">', ''))
            if not (codes and hrefs and names):
                # Malformed item: skip it rather than die with IndexError.
                continue

            self.index = self.index + 1
            row_id = self.index
            parent_id = 'null' if pId == 0 else pId
            print(self._INSERT_SQL % (row_id, parent_id,
                                      self._sql_quote(codes[0]),
                                      self._sql_quote(names[0])))
            # Resolve relative links against the current page; an absolute
            # href is returned unchanged by urljoin.
            self.GetBookType(urljoin(url, hrefs[0]), row_id)

    @staticmethod
    def _sql_quote(text):
        """Double single quotes so ``text`` cannot end the SQL string literal."""
        return text.replace(u"'", u"''")


if __name__ == '__main__':
    # NOTE(review): the published snippet had this call garbled into an
    # unterminated string ('http://byrx.net 0)).  The URL below looks like
    # the hosting site's own address rather than the real start page --
    # confirm the intended root URL before running.
    t = Test()
    t.GetBookType('http://byrx.net', 0)
# This snippet comes from http://byrx.net
# 评论关闭 (comments closed)