一只从百度开始不断搜索的小爬虫,百度搜索爬虫,这是我第三天学python
一只从百度开始不断搜索的小爬虫,百度搜索爬虫,这是我第三天学python
这是我第三天学python了, 想写一个东西纪念一下, 于是写了一只爬虫。它算不上一只好虫, 只能将网页的关键词存到本地, 但是我觉得基本上算是一只小虫了
文中用到了BeautifulSoup这个库, 用来解析html文档; 因为我只是提取了title的关键字, 所以也可以用正则表达式代替。还有一个库是jieba, 用来做中文分词; 再有一个库是chardet, 用来判断字符的编码。本想用多线程的, 但是自己被搞糊涂了, 就放弃了
# coding: utf-8
"""Tiny single-threaded crawler that starts at Baidu and follows links.

Written for Python 2 (urllib2, Queue, BeautifulSoup 3).  For every page that
can be fetched, the text of its <title> is segmented with jieba and the
resulting words are appended to PATH + 'cutKeyWors.txt'.
"""
import os
import re
import urllib
import urllib2
import urlparse
import sys
import time
import Queue
import thread
import threading

import jieba
import chardet
from BeautifulSoup import BeautifulSoup as BS

# NOTE(review): DEEP is never referenced below; presumably a planned crawl
# limit -- confirm before wiring it in.
DEEP = 1000
# Not used anywhere below: the crawl runs on a single thread.
LOCK = threading.Lock()
# Output directory for the keyword file and saved pages.
PATH = "c:\\test\\"
# URLs waiting to be fetched.
urlQueue = Queue.Queue()
# URLs already queued or fetched, so that no page is crawled twice.
seenUrls = set()

# Captures the href value of <a ...> tags (quoted or unquoted).
_HREF_RE = re.compile(
    r'<\s*[Aa]{1}\s+[^>]*?[Hh][Rr][Ee][Ff]\s*=\s*[\"\']?([^>\"\']+)[\"\']?.*?>')


def pachong():
    """Return the URL the crawl starts from."""
    url = 'http://www.baidu.com'
    return url


def getPageUrl(html, baseUrl=None):
    """Queue every not-yet-seen link found in *html*.

    html    -- page source as a string; None or empty is ignored.
    baseUrl -- optional URL of the page *html* came from; when given,
               relative hrefs are resolved against it.

    Hrefs of 10 characters or fewer and hrefs containing 'javascript' are
    skipped.
    """
    if not html:
        return
    for url in _HREF_RE.findall(html):
        if len(url) <= 10 or url.find('javascript') != -1:
            continue
        if baseUrl:
            url = urlparse.urljoin(baseUrl, url)
        # The fragment never reaches the server, so drop it before
        # de-duplicating.
        url = url.split('#')[0]
        if not url or url in seenUrls:
            continue
        seenUrls.add(url)
        urlQueue.put(url)


def getContents(url):
    """Fetch *url* and return its body re-encoded as GB2312 bytes.

    Returns None (after printing the error) when the URL cannot be opened.
    """
    try:
        url = url.split('#')[0]
        # Only unicode needs encoding; calling .encode() on a byte string
        # holding non-ASCII bytes would raise UnicodeDecodeError.
        if isinstance(url, unicode):
            url = url.encode('utf-8')
        url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")
        req = urllib2.urlopen(url)
        try:
            res = req.read()
        finally:
            req.close()
    except urllib2.HTTPError as e:
        print(e.code)
        return None
    except urllib2.URLError as e:
        print(str(e))
        return None
    except ValueError as e:
        # urlopen raises ValueError for URLs without a usable scheme.
        print(str(e))
        return None

    # chardet reports None when it cannot guess; fall back to UTF-8.
    code = chardet.detect(res)['encoding'] or 'utf-8'
    try:
        text = res.decode(str(code), 'ignore')
    except LookupError:
        # Codec name not known to this Python build.
        text = res.decode('utf-8', 'ignore')
    return text.encode('gb2312', 'ignore')


def writeToFile(html, url):
    """Save *html* under PATH in a file named after the current timestamp.

    *url* is accepted but not used.  start() does not call this function.
    """
    with open(PATH + str(time.time()) + '.html', 'w') as fp:
        fp.write(html)


def getKeyWords(html):
    """Extract the <title> text of *html* and pass it to cutWords().

    Pages that are empty or have no title are skipped.
    """
    if not html:
        return
    # Presumably chardet reporting ISO-8859-2 is a misdetected GBK page, so
    # the bytes are reinterpreted as GBK before parsing.
    if chardet.detect(html)['encoding'] == 'ISO-8859-2':
        html = html.decode('gbk', 'ignore').encode('gb2312', 'ignore')
    soup = BS(html, fromEncoding="gb2312")
    titleTag = soup.title
    if titleTag is None or not titleTag.contents:
        return
    cutWords(titleTag.contents[0])


def cutWords(contents):
    """Segment *contents* with jieba and append the words to the keyword file.

    The words are written one per line, GB2312-encoded, to
    PATH + 'cutKeyWors.txt'.
    """
    print(contents)
    res = '\n'.join(jieba.cut_for_search(contents))
    print(res)
    with open(PATH + 'cutKeyWors.txt', 'a') as keyWords:
        # The trailing newline keeps the last word of this title from running
        # into the first word of the next one.
        keyWords.write(res.encode('gb2312', 'ignore') + '\n')


def start():
    """Process queued URLs until the queue is empty."""
    while not urlQueue.empty():
        url = urlQueue.get()
        # Also covers URLs queued directly, such as the start URL.
        seenUrls.add(url)
        html = getContents(url)
        if html is None:
            continue
        getPageUrl(html, url)
        getKeyWords(html)


if __name__ == '__main__':
    # Writing the keyword file fails if the output directory is missing.
    if not os.path.isdir(PATH):
        os.makedirs(PATH)
    startUrl = pachong()
    urlQueue.put(startUrl)
    start()
# This snippet comes from http://byrx.net
评论关闭