Writing a Web Crawler in Python


I. Preliminaries

To build a small web-crawler program, the following preparation is needed:

1. A basic understanding of the HTTP protocol

2. Familiarity with the urllib2 library interface

3. Familiarity with Python regular expressions

       

II. Program Design

This article implements only a basic web crawler; its workflow is as follows (a minimal sketch of the fetch-and-extract pattern comes right after the list):

1. Find the page to crawl, view its source, and work out the HTML patterns of the data to be extracted

2. Use the urllib2 library to read the target page

3. Use regular expressions to extract the required information from the page

4. Validate the extracted data, i.e. filter out bad records

5. Save the valid records, e.g. to a file or a database
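
As a warm-up, here is a minimal sketch of the fetch-and-extract pattern behind steps 2 and 3. The URL and the <title> regex are placeholders for illustration only; the real pattern used against cnproxy.com appears in the full example below.

[python]
# -*- coding: utf-8 -*-
import re
import urllib2

def fetch_titles(url):
    """Fetch a page (step 2) and pull out its <title> text with a regex (step 3)."""
    html = urllib2.urlopen(url, timeout=10).read()
    pattern = re.compile(r'<title>(.*?)</title>', re.S | re.I)
    return pattern.findall(html)

if __name__ == '__main__':
    for title in fetch_titles("http://www.example.com/"):
        print title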

 


III. Example: Crawling a Proxy-Server List

Find a proxy-list website; here we use http://www.cnproxy.com/proxy1.html and extract the IP, port, type and area fields from it. The work relies mainly on regular expressions and the urllib2 library; the code is as follows:

       

[python]
# -*- coding: utf-8 -*-
import re
import Queue
import urllib2

proxylist1 = Queue.Queue()  # holds every proxy IP/port record scraped from the pages

# cnproxy hides port numbers behind document.write() calls that emit letter codes;
# this table maps each letter back to its digit
portdicts = {'z': "3", 'm': "4", 'a': "2", 'l': "9", 'f': "0", 'b': "5",
             'i': "7", 'w': "6", 'x': "8", 'c': "1", 'r': "8", 'd': "0"}

def get_proxy_from_cnproxy():
    global proxylist1

    p = re.compile(r'''<tr><td>(.+?)<SCRIPT type=text/javascript>document.write\(":"\+(.+?)\)</SCRIPT></td><td>(.+?)</td><td>.+?</td><td>(.+?)</td></tr>''')

    for i in range(1, 11):
        target = r"http://www.cnproxy.com/proxy%d.html" % i
        print target
        req = urllib2.urlopen(target)
        result = req.read()
        matchs = p.findall(result)
        #print matchs

        for row in matchs:
            ip = row[0]
            port = row[1]
            if port is None:
                continue
            tmp = port.split('+')

            # some encoded ports contain letters missing from portdicts; skip those records
            flag = 0
            for x in tmp:
                if x not in portdicts:
                    flag = 1
                    break
            if flag == 1:
                continue

            port = map(lambda x: portdicts[x], port.split('+'))
            port = ''.join(port)
            agent = row[2]
            addr = row[3].decode("cp936").encode("utf-8")

            l = [ip, port, agent, addr]
            print l
            proxylist1.put(l)

    print "page 1-10 size:%s nums proxy info" % proxylist1.qsize()


The code above extracts the required fields and stores them in the queue proxylist1. Next, every record in proxylist1 must be checked for validity; the records that pass are put into another queue, ProxyCheckedList, then the valid records are sorted, and finally saved to a file. The code is as follows:


[python]
import threading

ProxyCheckedList = Queue.Queue()  # holds the proxy records that passed validation

class ProxyCheck(threading.Thread):
    def __init__(self, fname):
        threading.Thread.__init__(self)
        self.timeout = 5
        #self.test_url = "http://www.baidu.com/"
        #self.test_str = "030173"
        #self.test_url = "http://www.so.com/"
        #self.test_str = '08010314'
        self.test_url = "http://www.renren.com"
        self.test_str = "110000000009"
        self.fname = fname
        self.checkedProxyList = []

    def checkProxy(self):
        threadpool = []
        for i in range(10):  # create 10 worker threads for the pool
            threadpool.append(ck_process(self.test_url, self.test_str, self.timeout, i))

        # start all 10 threads so they validate proxies concurrently
        map(lambda x: x.start(), threadpool)

        # wait for every thread to exit
        map(lambda x: x.join(), threadpool)

        while ProxyCheckedList.empty() == False:
            try:
                content = ProxyCheckedList.get_nowait()
            except Exception, e:
                print e
            else:
                self.checkedProxyList.append(content)
        print "the checked proxylist contains: %s nums records" % len(self.checkedProxyList)
        for info in self.checkedProxyList:
            print info

    def sort(self):
        # sort the validated proxies by response time (field 4)
        self.checkedProxyList.sort(key=lambda x: x[4])

    def save(self):
        f = open(self.fname, 'w+')
        for proxy in self.checkedProxyList:
            f.write("%s:%s\t%s\t%s\t%s\n" % (proxy[0], proxy[1], proxy[2], proxy[3], proxy[4]))
        f.close()

    def run(self):
        self.checkProxy()
        self.sort()
        self.save()
        print 'Done'

This class inherits from threading.Thread, and its flow is driven by run(), which follows exactly the steps analysed above. A brief note on checkProxy(): it creates 10 threads, starts them all at once with map(), and then waits for them to exit; after that it only has to drain and post-process the data left in ProxyCheckedList. All 10 threads do the same job: each repeatedly takes up to 10 HTTP proxy records (IP, port, etc.) from the queue proxylist1 and checks each record for validity. The validity check works as follows:

Create a cookie handler with urllib2.HTTPCookieProcessor().

Build a proxy handler from each record's IP and port: proxy_handler = urllib2.ProxyHandler({"http": r'http://%s:%s' % (proxy[0], proxy[1])})

Bind the proxy handler and the cookie handler into an opener: opener = urllib2.build_opener(cookies, proxy_handler)

Install the opener globally: urllib2.install_opener(opener)

Finally, use that proxy to request a test site; if data comes back within the allotted time, the proxy is valid and is put into the queue ProxyCheckedList, otherwise move on to the next record. The code is below:


[python]
import time

# locks guarding access to the two shared queues
lock_que = threading.Lock()
lock_que_cked = threading.Lock()

class ck_process(threading.Thread):
    '''Worker thread: validates proxy IPs concurrently'''

    def __init__(self, test_url, test_str, timeout, count):
        threading.Thread.__init__(self)
        self.proxy_contain = []
        self.test_url = test_url
        self.test_str = test_str
        self.checkedProxyList = []
        self.timeout = timeout
        self.count = count

    def run(self):
        cookies = urllib2.HTTPCookieProcessor()  # build a cookie handler
        #print "I'm thread process No. %s" % self.count
        while proxylist1.empty() == False:
            self.proxy_contain = []  # start a fresh batch each round
            if lock_que.acquire():  # lock acquired
                if proxylist1.qsize() >= 10:
                    number = 10
                else:
                    number = proxylist1.qsize()

                for i in range(number):  # take up to 10 proxy records from the raw queue
                    proxy = proxylist1.get_nowait()
                    self.proxy_contain.append(proxy)
                    #print "%s thread process:%s" % (self.count, self.proxy_contain)
                lock_que.release()

            # each thread then checks its batch one record at a time
            for proxy in self.proxy_contain:

                proxy_handler = urllib2.ProxyHandler({"http": r'http://%s:%s' % (proxy[0], proxy[1])})  # proxy handler for this record
                opener = urllib2.build_opener(cookies, proxy_handler)  # bind the proxy handler and the cookie handler

                # pretend to be a browser by adding a User-Agent header
                opener.addheaders = [('user-agent', 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31')]
                urllib2.install_opener(opener)  # install the opener globally
                t1 = time.time()  # get current time
                try:  # some proxies are dead and the request will fail
                    req = urllib2.urlopen(self.test_url, timeout=self.timeout)
                    result = req.read()
                    timeused = time.time() - t1
                    pos = result.find(self.test_str)

                    if pos > 1:
                        self.checkedProxyList.append((proxy[0], proxy[1], proxy[2], proxy[3], timeused))
                    else:
                        continue

                except Exception, e:
                    #print e.message
                    continue

        if len(self.checkedProxyList) != 0:
            if lock_que_cked.acquire():
                for proxy in self.checkedProxyList:
                    ProxyCheckedList.put(proxy)
                lock_que_cked.release()

        print "%s thread process:out: %s nums" % (self.count, len(self.checkedProxyList))

        print "%s thread process:out: %s nums"%(self.count, len(self.checkedProxyList))        (完)
