抓取乌云会员信息,抓取会员信息,抓取乌云网站白帽子会员信息


抓取乌云网站白帽子会员信息的脚本,没有太多的技术含量。

1、主要练习 Python 正则表达式与 httplib 模块的使用;

2、技术点:字符集转换,使用正则表达式匹配中文。

ps:嘿嘿,可以练习下修改这个脚本抓下厂家的信息嘛~。。。

#!/usr/bin/python
##########################################
#    Platform: Windows 8.1 64Bit         #
#    Language: Python                    #
#    Version: 2.7 / 3.x                  #
##########################################
import io
import re
import sys

try:
    import httplib  # Python 2
except ImportError:  # the module was renamed in Python 3
    import http.client as httplib

# Request headers sent with every page fetch.
_HEADERS = {
    "Content-type": "application/x-www-form-urlencoded",
    "Accept": "text/plain;charset=utf-8",
}

# Pager text: '"page">' followed by CJK characters, whitespace, a first number
# and, later on, a second number.  The second number is used as the page count.
# Deliberately NOT a raw string: Python 2's ``re`` does not understand \uXXXX
# escapes, so the string literal has to expand them itself.
_TOTAL_PAGES_RE = re.compile(u'"page">[\u4e00-\u9fa5]+\\s(\\d+).*?(\\d+)')

# One table row.  Captured groups, in order: first <th>, the <td> cell (holds
# the member link), third <th> (used as the grouping key) and sixth <th>
# (written out as "Rank:").  The fourth and fifth <th> cells are skipped.
_ROW_RE = re.compile(
    r'<tr>\s+<th>(.*?)</th>\s+<td>(.*?)</td>\s+<th>(.*?)</th>'
    r'\s+<th>.*?</th>\s+<th>.*?</th>\s+<th>(.*?)</th>'
)
_TAG_RE = re.compile(r'<.*?>')
_HREF_RE = re.compile(r'href="(.*?)"', re.I)

# Characters that must not appear in a file name built from scraped text
# (path separators, Windows-reserved punctuation and control characters).
_UNSAFE_FILENAME_RE = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _say(*parts):
    """Print *parts* on one line, separated by single spaces.

    Produces the same text as the Python 2 ``print a, b`` statement
    (including the trailing space left by a final empty string) while being
    valid syntax on both Python 2 and Python 3.
    """
    print(u" ".join(u"%s" % (part,) for part in parts))


def _fetch(conn, path):
    """GET *path* on *conn* and return ``(status, body_text)``."""
    conn.request("GET", path, headers=_HEADERS)
    response = conn.getresponse()
    # Always drain the body, even for error statuses: the connection is
    # reused, and the next request fails unless the previous response has
    # been read completely.
    raw = response.read()
    # Decode once at the I/O boundary; a stray invalid byte should not abort
    # the whole crawl, so undecodable bytes are replaced.
    return response.status, raw.decode("utf-8", "replace")


def _group_filename(group):
    """Return the output file name for *group* with unsafe characters replaced."""
    return _UNSAFE_FILENAME_RE.sub(u"_", group) + u".txt"


def _save_members(conn, host, url, total_pages):
    """Crawl pages 1..total_pages and append every row to its group's file.

    Returns a dict mapping each group key to the number of rows written.
    All opened files are closed before returning, also on error.
    """
    files = {}
    counts = {}
    try:
        for page in range(1, total_pages + 1):
            path = url + str(page)
            status, text = _fetch(conn, path)
            _say("status:", status, "http://" + host + path)
            if status != 200:
                continue
            for first, cell, group, rank in _ROW_RE.findall(text):
                fp = files.get(group)
                if fp is None:
                    fp = io.open(_group_filename(group), "w", encoding="utf-8")
                    files[group] = fp
                counts[group] = counts.get(group, 0) + 1
                name = _TAG_RE.sub(u"", cell)
                href = _HREF_RE.search(cell)
                # A cell without a link yields an empty path instead of a crash.
                link = href.group(1) if href else u""
                fp.write(u"%s|http://%s%s|%s|Rank:%s\n"
                         % (first, host, link, name, rank))
    finally:
        # Release every file handle, whether or not the crawl completed.
        for fp in files.values():
            fp.close()
    return counts


def sendHttp(host, url):
    """Scrape the paginated member table at ``http://<host><url><page>``.

    Page 1 is fetched first to read the total page count from the pager
    (defaulting to 1 when the pager is not found).  Every page is then
    fetched; each table row is written as
    ``<first column>|http://<host><link>|<name>|Rank:<last column>`` to a
    UTF-8 text file in the current directory named after the row's third
    column.  Progress and a per-group summary are printed to stdout.

    Args:
        host: Server host name; port 80 with a 10 second timeout is used.
        url: Path prefix to which the page number is appended.

    Returns:
        None.

    Raises:
        Connection and HTTP errors from ``httplib`` / ``socket`` and file
        errors from ``io.open`` propagate to the caller.
    """
    # The original passed strict=True positionally; that parameter no longer
    # exists on Python 3 (the slot is the timeout there), so only the timeout
    # is given, by keyword.
    conn = httplib.HTTPConnection(host, 80, timeout=10)
    try:
        status, text = _fetch(conn, url + "1")
        if status != 200:
            _say("Url open failed,status code", status)
            return

        match = _TOTAL_PAGES_RE.search(text)
        total_pages = int(match.group(2)) if match else 1
        _say("total page:", total_pages)
        _say("start runing...", "")

        counts = _save_members(conn, host, url, total_pages)
    finally:
        conn.close()

    _say("File successfully saved.", "")

    # Per-group and overall member counts.
    _say("=================================================", "")
    for group in counts:
        _say(group + u":", counts[group])
    _say("total member:", sum(counts.values()))
    _say("=================================================", "")


if __name__ == "__main__":
    if sys.version_info[0] == 2:
        # Python 2 only, retained from the original script: makes implicit
        # str/unicode conversions (e.g. printing non-ASCII text to a
        # redirected stdout) use UTF-8 instead of ASCII.
        reload(sys)  # noqa: F821 - builtin on Python 2
        sys.setdefaultencoding('utf8')
    host = 'www.wooyun.org'
    url = "/whitehats/page/"
    sendHttp(host, url)
# Snippet source: http://byrx.net

评论关闭