Crawl a specified web page and all of the pages it links to. The script below fetches the given page, follows every link on it, saves each linked page locally (skipping files that already exist), and finally generates a Devhelp index (.devhelp2) for the downloaded HTML files. It was originally written to mirror a Baidu Space blog.


#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************************************************************
# Copyright (C) 2010 yangyingchao@gmail.com
# Author: yangyingchao <yangyingchao@gmail.com>
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# this program; see the file COPYING.  If not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# ****************************************************************************
from copy import deepcopy
from sgmllib import SGMLParser

import os
import sys
import urllib2

title = "Untitled"


class MyParser(SGMLParser):
    """Collect the page title and every <a href="..."> link of a page."""

    def __init__(self):
        self.data = ""
        self.link = ""
        self.title = "Untitled"
        self.links = []
        self.TAG_BEG = False
        self.TAG_END = False
        SGMLParser.__init__(self, 0)

    def handle_data(self, data):
        # Only record text that sits inside a tag we care about
        # (<title> or <a>).
        if self.TAG_BEG and not self.TAG_END:
            self.data += data

    def start_title(self, attrs):
        self.link = ""
        self.data = ""
        self.TAG_BEG = True
        self.TAG_END = False
        for (key, val) in attrs:
            if key == "href":
                self.link = val

    def end_title(self):
        self.TAG_BEG = False
        self.TAG_END = True
        self.title = self.data.strip()

    def flush(self):
        pass

    def handle_comment(self, data):
        pass

    def start_a(self, attrs):
        self.link = ""
        self.data = ""
        self.TAG_BEG = True
        self.TAG_END = False
        for (key, val) in attrs:
            if key == "href":
                self.link = val

    def end_a(self):
        self.TAG_BEG = False
        self.TAG_END = True
        self.links.append(deepcopy({"name": self.data, "link": self.link}))

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass

    def unknown_entityref(self, ref):
        pass

    def unknown_charref(self, ref):
        pass

    def unknown_decl(self, data):
        pass

    def close(self):
        SGMLParser.close(self)
        self.flush()


def lst2str(lst):
    string = ""
    for item in lst:
        string += item.strip() + "\n"
    return string


def downURL(url, filename):
    """Download url and save it as filename; return 1 on success, 0 on failure."""
    print "Download %s, save as %s" % (url, filename)
    try:
        fp = urllib2.urlopen(url)
    except:
        print "download exception"
        print sys.exc_info()
        return 0
    op = open(filename, "wb")
    while 1:
        s = fp.read()
        if not s:
            break
        op.write(s)
    fp.close()
    op.close()
    return 1


def reptile(base_url):
    """
    Download all articles linked from base_url.

    Arguments:
    - `base_url`: URL of the website (or path of a local HTML file).
    """
    page_list = []
    if not len(base_url):
        print "No page to reptile!"
        sys.exit(1)

    parser = MyParser()
    # base_url may be a remote URL or a local file.
    if base_url.startswith("http"):
        myopen = urllib2.urlopen
    else:
        myopen = open
    try:
        content = myopen(base_url).read()
    except:
        print "Failed to read from %s." % base_url
        print sys.exc_info()
        return

    parser.feed(content)
    for tmp in parser.links:
        page_list.append(tmp.get("link"))
    global title
    title = parser.title
    parser.close()

    item_list = list(set(page_list))
    for item in item_list:
        # Strip the '#fragment' part from the url.
        pos = item.find('#')
        if pos != -1:
            item = item[:pos]
        # Prepend base_url to relative links.
        if not item.startswith("http"):
            item = base_url.rstrip("/") + "/" + item
        local_file = item.split("/")[-1]
        print item, local_file
        if not local_file:
            print "Empty local file! Continue from next one!"
            continue
        if os.access(local_file, os.F_OK):
            print "File: %s existed, skip ..." % local_file
        else:
            downURL(item, local_file)

    # Remember to download the index file!
    downURL(base_url, "index.html")
    print "Total: %d articles." % (len(item_list))


def walk_dir(lst, dirname, filenames):
    """os.path.walk() callback: collect file name and title of each local HTML file."""
    for filename in filenames:
        fn = os.path.join(dirname, filename)
        if os.path.isdir(fn) or not filename.endswith("html"):
            continue
        print "Processing: %s" % fn
        parser = MyParser()
        parser.feed(open(fn).read())
        tmp = {"file": filename, "title": parser.title}
        parser.close()
        lst.append(deepcopy(tmp))


def gen_index():
    """
    Generate a Devhelp index of all HTML files in the current directory.
    """
    file_lists = []
    os.path.walk(".", walk_dir, file_lists)
    fp = open("%s.devhelp2" % os.path.basename(os.getcwd()), "w")
    string = '<?xml version="1.0" encoding="utf-8"?>\n<book author=""' + \
        ' language="c" link="index.html" name="" title="%s"' % title + \
        ' version="2" xmlns="http://www.devhelp.net/book">\n  <chapters>\n'
    for item in file_lists:
        link = item.get("file")
        try:
            # Baidu space pages are gbk-encoded; the index should be utf-8.
            name = item.get("title").decode('gbk').encode('utf-8')
        except:
            name = item.get("title")
        finally:
            string += '    <sub link="%s" name="%s"/>\n' % (link, name)
    string += '  </chapters>\n</book>\n'
    fp.write(string)
    fp.close()


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print "Usage: %s <url of baidu space>" % sys.argv[0]
        print "Such as: %s http://hi.baidu.com/Username" % sys.argv[0]
        gen_index()
        sys.exit(1)
    base_url = sys.argv[1]
    reptile(base_url)
    gen_index()
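For a quick look at what the parser extracts, here is a minimal sketch, assuming the script above is saved as reptile.py (that filename is only an illustration) and run under Python 2, since sgmllib and urllib2 no longer exist in Python 3. It feeds a small in-memory HTML fragment to MyParser instead of downloading anything:

# Minimal sketch: exercise MyParser on a tiny HTML fragment.
from reptile import MyParser  # hypothetical module name for the script above

# One title and two links, the second with a '#fragment' part.
html = ('<html><head><title>Demo</title></head>'
        '<body><a href="post1.html">First post</a>'
        '<a href="post2.html#top">Second post</a></body></html>')

parser = MyParser()
parser.feed(html)
parser.close()

print parser.title                # prints: Demo
for item in parser.links:         # each item is {"name": ..., "link": ...}
    print item["name"], "->", item["link"]

To actually mirror a site, run the script directly, e.g. python reptile.py http://hi.baidu.com/Username; it saves index.html plus every linked page into the current directory and then writes a <current-directory-name>.devhelp2 index file.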
