python 检测关键词在百度的排名,python关键词,python检测网站某关


Python 检测网站某关键词在百度的排名位置

要检测网站关键词在百度搜索中的排名,我的实现思路是:1、抓取搜索结果页面并过滤出其中的 URL;2、判断网站的域名是否出现在 URL 序列中,有则返回排名,无则进入下一页继续查找;3、如此重复,直到找到网站域名或到达最后一页为止。

# coding=utf-8
# Report where a site first shows up in Baidu's search results for a keyword.
#
# Approach: fetch one result page, extract the outbound result URLs, check
# whether the target domain occurs in any of them, and otherwise move on to
# the next page until Baidu stops offering a "next page" link.
#
# The module runs unchanged on Python 2 and Python 3.

import re
from contextlib import closing

try:  # Python 3
    from urllib.parse import urlencode
    from urllib.request import build_opener
except ImportError:  # Python 2
    from urllib import urlencode
    from urllib2 import build_opener

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3


class Client(object):
    """A single fetched page of Baidu search results.

    Constructing a Client performs the HTTP request immediately.

    Attributes:
        headers: list of (name, value) request headers sent to Baidu.
        baiduUrl: the search URL this page was fetched from.
        searchResult: the page body as text.
        urlList: result URLs extracted from the page, in page order.
        ranking: 1-based position in ``urlList`` of the last match found by
            ``isFoundMysite``; 0 when nothing has matched.
    """

    # Every double-quoted href value in the page.
    _HREF_PATTERN = re.compile(r'href="(.+?)"')
    # Link text Baidu renders when a further result page exists.
    _NEXT_PAGE_MARKER = u"下一页"

    def __init__(self, baiduUrl):
        self.headers = [
            ("User-agent",
             "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)")
        ]
        self.baiduUrl = baiduUrl
        self.searchResult = self.__requestBaidu()
        self.urlList = self.__getUrls()
        self.ranking = 0

    def __requestBaidu(self):
        """Fetch ``self.baiduUrl`` and return the response body as text.

        The body is decoded as UTF-8 with undecodable bytes replaced, so the
        substring checks elsewhere in the class operate on text on both
        Python 2 and Python 3.

        Raises:
            URLError/HTTPError (from urllib) when the request fails.
        """
        opener = build_opener()
        opener.addheaders = self.headers
        # closing() guarantees the connection is released; Python 2 response
        # objects are not context managers themselves.
        with closing(opener.open(self.baiduUrl)) as response:
            raw = response.read()
        return raw.decode("utf-8", "replace")

    def isLastPage(self):
        """Return True when the page has no "next page" link."""
        return self._NEXT_PAGE_MARKER not in self.searchResult

    def isFoundMysite(self, mySite):
        """Return True if ``mySite`` occurs in any result URL of this page.

        On a hit ``self.ranking`` is set to the 1-based position of the first
        matching URL; on a miss it is reset to 0. The position is recomputed
        on every call, so repeated calls do not accumulate.
        """
        self.ranking = 0
        for position, url in enumerate(self.urlList, 1):
            if mySite in url:
                self.ranking = position
                return True
        return False

    def __getUrls(self):
        """Return the list of result URLs found in the page, in page order."""
        candidates = self._HREF_PATTERN.findall(self.searchResult)
        return [url for url in candidates if self.__isLegalUrl(url)]

    def __isLegalUrl(self, url):
        """Return True for hrefs that look like external search results.

        Only absolute ``http://`` links are kept, which also drops Baidu's
        relative pagination links (``s?...``). Links containing the literal
        text ``baidu.com`` are Baidu's own and are dropped too.
        """
        return url.startswith("http://") and "baidu.com" not in url


def getRank(mySite, keyword, pn):
    """Look up where ``mySite`` ranks on Baidu for ``keyword``.

    Args:
        mySite: domain (or any URL fragment) to look for in result URLs.
        keyword: the search query.
        pn: Baidu result offset to start from; 0 is the first page and each
            page advances the offset by 10.

    Returns:
        ``(page_number, ranking_on_page, search_url)`` for the first page on
        which the site appears, or None when the last page is reached
        without a match.
    """
    while True:
        # Integer division keeps the page number an int on Python 3 as well.
        curPage = pn // 10 + 1
        print("page %s..." % curPage)
        params = {"wd": keyword, "pn": pn}
        baiduUrl = "http://www.baidu.com/s?%s" % urlencode(params)
        baidu = Client(baiduUrl)
        if baidu.isFoundMysite(mySite):
            return (curPage, baidu.ranking, baiduUrl)
        if baidu.isLastPage():
            print("The last page")
            return None
        # Not on this page: advance to the next page of results.
        pn += 10


def run():
    """Interactive loop: ask for a domain once, then rank keyword after keyword.

    The loop has no exit condition of its own; it ends when the user
    interrupts the program or input reaches end-of-file.
    """
    mySite = _read_line("Enter your domain: ")
    while True:
        keyword = _read_line("Enter your keyword: ")
        rank = getRank(mySite, keyword, 0)
        if rank:
            print("Found in page %s, ranking:%s \n%s\n" % rank)
        else:
            print("All not found")


if __name__ == "__main__":
    run()

查询结果

Enter your domain: byrx.netI am worker mainstaring..024681012141618end.

评论关闭