python精简版搜索引擎,python精简版


from collections import deque
from html.parser import HTMLParser
from urllib import parse
from urllib.request import urlopen


class LinkParser(HTMLParser):
    """HTML parser that collects the absolute target of every ``<a href=...>``.

    Attributes:
        links: Absolute URLs found so far, in document order.
        baseUrl: URL that relative ``href`` values are resolved against.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Initialise here so feed() can be used without calling getLinks() first.
        self.links = []
        self.baseUrl = ""

    def handle_starttag(self, tag, attrs):
        """Record the absolute URL of each ``href`` attribute on an ``<a>`` tag."""
        if tag != 'a':
            return
        for key, value in attrs:
            # ``value`` is None for a bare ``href`` attribute: nothing to follow.
            if key == 'href' and value:
                self.links.append(parse.urljoin(self.baseUrl, value))

    def getLinks(self, url):
        """Fetch *url* and return ``(html_text, links)``.

        Returns ``("", [])`` when the response is not ``text/html``.  The body
        is decoded with the charset declared in the Content-Type header,
        falling back to UTF-8; undecodable bytes are replaced instead of
        aborting the page.

        Raises:
            Whatever ``urlopen`` raises for an unreachable or invalid URL.
        """
        self.links = []
        self.baseUrl = url
        # ``with`` closes the connection even if reading or decoding fails.
        with urlopen(url) as response:
            # ``headers`` is available on every urlopen() response type, unlike
            # ``getheader`` which only HTTP responses provide.
            headers = response.headers
            # Compare the media type only, so any charset / header spelling of
            # an HTML page is accepted rather than one exact header string.
            if headers.get_content_type() != 'text/html':
                return "", []
            charset = headers.get_content_charset() or 'utf-8'
            htmlString = response.read().decode(charset, errors='replace')
        self.feed(htmlString)
        return htmlString, self.links


def spider(url, word, maxPages):
    """Breadth-first crawl from *url* until a page containing *word* is found.

    Args:
        url: Page to start crawling from.
        word: Text to look for in each fetched HTML page.
        maxPages: Maximum number of pages to fetch.

    Returns:
        The URL of the first page whose HTML contains *word*, or ``None`` when
        the crawl ends (queue exhausted or *maxPages* reached) without a match.
    """
    pagesToVisit = deque([url])
    # Every URL ever queued, so link cycles do not cause repeated fetches.
    queued = {url}
    numberVisited = 0
    foundAt = None
    while numberVisited < maxPages and pagesToVisit and foundAt is None:
        numberVisited += 1
        url = pagesToVisit.popleft()
        print(numberVisited, "搜索页:", url)
        try:
            data, links = LinkParser().getLinks(url)
        except Exception as exc:
            # Best effort: a page that cannot be fetched or parsed is skipped,
            # but Ctrl-C / SystemExit still propagate.
            print(" **错误!**", exc)
            continue
        if word in data:
            foundAt = url
            print(" **成功!**")
            break
        for link in links:
            if link not in queued:
                queued.add(link)
                pagesToVisit.append(link)
    if foundAt is not None:
        # This branch reports a hit, so the message must say "success".
        print("该关键字", word, "搜索成功", foundAt)
    else:
        print("没有找到任何有关的网页")
    return foundAt


if __name__ == "__main__":
    spider("http://yuedu.fm/", "夏洛特", 100)

评论关闭