# A simple class wrapper providing basic web operations (urllib/urllib2).
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''Thin wrapper around urllib/urllib2 for convenient web access.

Provides browser-masquerading request headers, a visited-URL history,
cookie handling, and transparent gzip/deflate response decoding.

NOTE: this module targets Python 2 (urllib2, cookielib, StringIO, xrange).
'''
import urllib, urllib2, cookielib
import gzip, zlib, StringIO
import os, re, sys, time


class Client(object):
    '''HTTP client that pretends to be a browser and remembers history.'''

    @staticmethod
    def Percent(a, b, c):
        '''Default download-progress callback: draw a 50-char bar on stderr.

        a -- units transferred so far
        b -- size of one unit in bytes
        c -- total size in bytes
        '''
        done = float(a * b * 100) / c
        if done > 100.0:
            done = 100.0
        filled = int(done / 2)
        sys.stderr.write('|')
        for _ in xrange(filled):
            sys.stderr.write('=')
        for _ in xrange(50 - filled):
            sys.stderr.write('-')
        sys.stderr.write('|%6.2f%%\r' % done)

    def __init__(self, proxy=None):
        '''Build an opener with browser-like headers and a cookie jar.

        proxy -- optional mapping understood by urllib2.ProxyHandler,
                 e.g. {'http': 'host:port'}.
        '''
        self.trace = []        # history of URLs opened via call()
        self.header = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Accept-Encoding': 'gzip,deflate',
            'Accept-Charset': 'GBK,utf-8;q=0.7,*;q=0.7',
            'Connection': 'Keep-Alive'
        }
        self.timeout = 5
        self.proxy = proxy
        self.resp = None       # last urllib2 response object
        self.page = None       # last decoded response body
        self.cookie = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        if proxy:
            self.opener = urllib2.build_opener(
                self.cookie, urllib2.ProxyHandler(proxy), urllib2.HTTPHandler)
        else:
            self.opener = urllib2.build_opener(self.cookie, urllib2.HTTPHandler)

    def __del__(self):
        # Release the opener's resources when the client goes away.
        self.opener.close()

    def call(self, url, query=None, data=None):
        '''Open *url* and keep the response in self.resp.

        query -- optional dict appended to the URL as a query string
        data  -- optional dict; when given the request is a POST
        '''
        if query:
            url = '%s?%s' % (url, urllib.urlencode(query))
        if self.trace:
            ref = self.trace[-1]
            # BUG FIX: in "scheme://host/...", split('/') yields
            # ['scheme:', '', 'host', ...] -- the host is [2], not [1]
            # (which is always ''), so the same-host Referer was never set.
            if 'Host' in self.header and ref.split('/')[2] == self.header['Host']:
                self.header['Referer'] = ref
            else:
                if 'Host' in self.header:
                    del self.header['Host']
                if 'Referer' in self.header:
                    del self.header['Referer']
        req = urllib2.Request(url, headers=self.header)
        req.timeout = self.timeout
        if data:
            self.resp = self.opener.open(req, urllib.urlencode(data))
        else:
            self.resp = self.opener.open(req)
        # Remember the host for the next request's same-host Referer check.
        self.header['Host'] = url.split('://')[1].split('/')[0]
        self.trace.append(url)

    def read(self, len=0):
        '''Read the response body, transparently inflating gzip/deflate.

        len -- number of bytes to read; <= 0 means read everything.
        (Parameter name kept for backward compatibility, despite
        shadowing the builtin.)
        '''
        if len > 0:
            medi = self.resp.read(len)
        else:
            medi = self.resp.read()
        encoding = self.resp.headers.get('content-encoding')
        if encoding == 'deflate':
            # Raw deflate first (common server quirk), then zlib-wrapped.
            try:
                self.page = zlib.decompress(medi, -zlib.MAX_WBITS)
            except zlib.error:
                self.page = zlib.decompress(medi)
        elif encoding == 'gzip':
            self.page = gzip.GzipFile(
                fileobj=StringIO.StringIO(medi), mode='r').read()
        else:
            self.page = medi
        return self.page

    def load(self, url, name, fac=None):
        '''Download *url* to file *name*, reporting progress via *fac*.

        fac -- callback fac(transferred, unit, total); defaults to
               Client.Percent. Returns True on success, False on error.
        NOTE: assumes the response is not content-encoded per chunk;
        chunked gzip bodies would not decode correctly here.
        '''
        self.call(url)
        size = int(self.resp.headers.getheader('content-length'))
        have = 0
        if not fac:
            fac = Client.Percent
        fac(0, 1, size)
        try:
            with open(name, 'wb') as out:
                while have < size:
                    data = self.read(65536)
                    if not data:
                        # BUG FIX: server closed early -- the original
                        # would spin forever on empty reads.
                        break
                    out.write(data)
                    # BUG FIX: count the bytes actually received; the
                    # original always added 65536 even for short reads.
                    have += len(data)
                    fac(have, 1, size)
            return True
        except Exception:
            # Best-effort download: any I/O failure reports False.
            return False

    def obtain(self, url, query=None, data=None):
        '''Open *url* and return the decoded page body.'''
        self.call(url, query, data)
        return self.read()

    def search(self, rule):
        '''Return the first regex match of *rule* in the last page, or None.'''
        if self.page:
            return re.search(rule, self.page)
        return None

    def getall(self, rule):
        '''Return all regex matches of *rule* in the last page (possibly []).'''
        if self.page:
            return re.findall(rule, self.page)
        return []


class Webproxy(object):
    '''Client look-alike that routes every request through a
    browse.php-style web proxy.'''

    def __init__(self, web, proxy=None):
        '''web -- host of the web proxy; proxy -- optional HTTP proxy dict.'''
        self.client = Client(proxy)
        self.webpxy = web
        self.query = {'u': None, 'b': '4', 'f': 'norefer'}

    def __del__(self):
        del self.client

    def call(self, url, query=None, data=None):
        '''Open *url* through the web proxy.'''
        self.query['u'] = url
        trs = 'http://%s/browse.php?%s' % (self.webpxy,
                                           urllib.urlencode(self.query))
        self.client.call(trs, query, data)
        # 'f=norefer' is only meaningful on the first request.
        if 'f' in self.query:
            del self.query['f']

    def read(self, len=0):
        return self.client.read(len)

    def load(self, url, name, fac=None):
        # BUG FIX: delegate to Client.load; the original called
        # Client.call(url, name, fac), passing a filename as a query dict.
        return self.client.load(url, name, fac)

    def obtain(self, url, query=None, data=None):
        self.client.call(url, query, data)
        return self.client.read()

    def search(self, rule):
        # BUG FIX: Client.search takes only the pattern; the original
        # passed a nonexistent self.page and always raised.
        return self.client.search(rule)

    def getall(self, rule):
        # BUG FIX: the method is Client.getall; Client has no findall.
        return self.client.getall(rule)


if __name__ == '__main__':
    pass
# Snippet originally from http://byrx.net
# Comments closed (scraped-page footer).