查询关键词在百度排名python脚本分享,python脚本,如下脚本使用urllib


如下脚本使用 urllib、urllib2 以及 re 正则表达式模块,查询指定站点在某个关键词下的百度排名。

# -*- coding: utf-8 -*-
"""Find where a site ranks in Baidu's search results for a keyword.

The module runs on both Python 2 and Python 3: the import block below picks
whichever urllib layout the running interpreter provides.
"""
import re
from contextlib import closing

try:  # Python 3 module layout
    from urllib.parse import quote_plus, urlparse
    from urllib.request import urlopen
except ImportError:  # Python 2 module layout
    import urllib
    import urllib2
    from urllib import quote_plus
    from urlparse import urlparse
    urlopen = urllib2.urlopen

# Number of results assumed per page; also the step of the `pn` query offset.
_PAGE_SIZE = 10

# Captures, for one result entry, the link target (`url`) and the caption text
# inside <span class="g"> (`urldate`).
# NOTE(review): this is tied to the markup Baidu served when the script was
# written -- confirm it still matches the current result page.
_RESULT_PATTERN = re.compile(
    r'<h3 class="t"><a[\s]+data-click="[^"]+"  href="(?P<url>[^"]+)"'
    r'.*?<span class="g">(?P<urldate>[^<]+)</span>'
)


def _fetch(url):
    """Download *url* and return the response body as text."""
    with closing(urlopen(url)) as response:
        body = response.read()
    # Python 3 hands back bytes; the search URL asks for ie=utf-8, so decode
    # as UTF-8. On Python 2 the body is already `str` and is left untouched.
    if not isinstance(body, str):
        body = body.decode('utf-8', 'replace')
    return body


def _resolve_final_url(url):
    """Open *url* and return the URL actually retrieved (after redirects)."""
    with closing(urlopen(url)) as response:
        return response.geturl()


def _caption_host(urldate):
    """Return the lowercase host taken from a result's caption text.

    The caption is treated as "<url> <rest>": everything up to the first
    space is the URL (presumably followed by a date -- verify against real
    result pages). A caption without a space is used in full.
    """
    shown_url = urldate.strip().split(' ', 1)[0]
    try:
        netloc = urlparse('http://' + shown_url).netloc
    except ValueError:
        # urlparse rejects malformed netlocs (e.g. unbalanced brackets).
        return ''
    # Drop an explicit port, if any.
    return netloc.split(':', 1)[0].lower()


def _host_belongs_to_site(host, siteHost):
    """Return True if *host* is *siteHost* itself or one of its subdomains."""
    return host == siteHost or host.endswith('.' + siteHost)


def get_site_word_baidu_rank(siteHost, word, maxScanPageNumber=10,
                             printSearchLog=False):
    """Search Baidu for *word* and locate the first result hosted on *siteHost*.

    Args:
        siteHost: Host name to look for (case-insensitive). Subdomains of it
            count as matches.
        word: The search keyword.
        maxScanPageNumber: Number of result pages to scan, starting at page 1.
            Zero or a negative value scans nothing.
        printSearchLog: When true, print a progress line before each page is
            fetched.

    Returns:
        ``(number, page, gotUrl, searchUrl)`` for the first match, where
        ``number`` is the overall position (assuming ``_PAGE_SIZE`` results
        per page), ``page`` the 1-based page number, ``gotUrl`` the final URL
        obtained by opening the result link, and ``searchUrl`` the search page
        it was found on. ``None`` if no scanned page contains a match.

    Raises:
        IOError/OSError (including urllib's URLError and HTTPError) when a
        request fails.
    """
    siteHost = siteHost.lower()
    for page in range(1, maxScanPageNumber + 1):
        searchUrl = ('http://www.baidu.com/s?wd=' + quote_plus(word)
                     + '&pn=' + str((page - 1) * _PAGE_SIZE)
                     + '&tn=baiduhome_pg&ie=utf-8&usm=2')
        if printSearchLog:
            print('搜索第%d页' % (page,))
        html = _fetch(searchUrl)
        for position, match in enumerate(_RESULT_PATTERN.finditer(html), 1):
            host = _caption_host(match.group('urldate'))
            if not _host_belongs_to_site(host, siteHost):
                continue
            # Report the URL reached by opening the result link rather than
            # the href found in the result page.
            gotUrl = _resolve_final_url(match.group('url'))
            number = (page - 1) * _PAGE_SIZE + position
            return (number, page, gotUrl, searchUrl)
    return None


def main():
    """Print the Baidu rank of a sample site for a few sample keywords."""
    words = ('程序员', '内存溢出', 'Outofmemory', 'python', 'java')
    siteHost = 'byrx.net'
    for w in words:
        result = get_site_word_baidu_rank(siteHost, w, 10)
        if result:
            print(w + ':你的网站排在第%d位,在第%d页,排上的链接是%s,搜索页地址%s' % result)
        else:
            print('未找到记录')


if __name__ == '__main__':
    main()

评论关闭