python获得 Alexa网站排名,pythonalexa,<python获得 Alexa网站排名>
python获得 Alexa网站排名,pythonalexa,<python获得 Alexa网站排名>python获得 Alexa网站排名代码片段
1.[python获得 Alexa网站排名代码][Python]代码
# coding: utf-8
"""Scrape Alexa "top sites by country" rankings from www.alexa.com."""
import re

from pyquery import PyQuery
import requests
import numpy  # NOTE(review): appears unused in this snippet; kept since imports are file-level


class AlexaTopSites(object):
    """Fetch Alexa's per-region top-site listings.

    Results of ``get_topSitesByRegin`` are cached per region short code
    in ``self.res``.
    """

    def __init__(self):
        # Country name (upper-case) -> Alexa region short code (upper-case).
        self.reginShortNames = {}
        # True once reginShortNames has been scraped from the country index.
        self.tag_update_reginShortNames = False
        # Cache: region short code -> list of site dicts.
        self.res = {}

    @property
    def getRegin(self):
        """Return the country-name -> short-code map, fetching it on first use."""
        if not self.tag_update_reginShortNames:
            self.update_reginShortNames()
        return self.reginShortNames

    def update_reginShortNames(self):
        """Scrape the country index page and rebuild ``self.reginShortNames``."""
        url = "http://www.alexa.com/topsites/COUNTries"
        pq = PyQuery(url=url)
        d = {}
        # Capture the last path component of each country link, e.g. ".../CN" -> "CN".
        re_href = re.compile(r'(?:.*/)(\w+)$')
        for a in pq("div.categories.top").find("a"):
            # Guard against links whose href does not match (original crashed
            # with AttributeError on a non-matching href).
            m = re_href.match(a.attrib['href'])
            if m and m.group(1):
                d[a.text.upper()] = m.group(1).upper()
        self.reginShortNames = d
        self.tag_update_reginShortNames = True

    def get_topSitesByRegin(self, regin, nums=25):
        """Return up to ``nums`` top sites for region ``regin``.

        ``regin`` may be a region short code (e.g. ``'CN'``) or a country
        name known to ``reginShortNames``.  Each result entry is a dict
        with keys ``'url'``, ``'name'`` and ``'description'``.  Returns an
        empty list on failure (the original returned ``None``, which broke
        callers that iterate the result).
        """
        URL_BASE = "http://www.alexa.com/topsites/COUNTries/"
        URL_PAGES = "http://www.alexa.com/topsites/COUNTries;"
        PAGE = 1
        COUNT = 0
        regin = regin.upper()
        url = URL_BASE + regin
        try:
            r = requests.get(url=url)
            if not r.ok:
                # Not a short code; try translating a country name to one.
                if not self.tag_update_reginShortNames:
                    self.update_reginShortNames()
                regin = self.reginShortNames.get(regin)
                if regin is None:
                    # Original crashed here (URL_BASE + None) and fell into
                    # the bare except; fail explicitly instead.
                    print("ERROR: get url fail.")
                    return []
                url = URL_BASE + regin
                r = requests.get(url=url)
        except Exception:
            # Best-effort scraper: report and give up rather than propagate.
            print("ERROR: get url fail.")
            return []
        pq = PyQuery(r.content)
        lst = []
        for li in pq("section.page-product-content section.col-r li"):
            anchor = PyQuery(li)("a")[0]
            description = PyQuery(li)("div.description")[0].text
            lst.append({
                'url': anchor.attrib['href'],
                'name': anchor.text,
                'description': description,
            })
            COUNT += 1
            if COUNT >= nums:
                self.res[regin] = lst
                return lst
        # First page did not yield enough entries; walk the paginated listing.
        while COUNT < nums:
            try:
                url = URL_PAGES + str(PAGE) + "/" + regin
                r = requests.get(url=url)
                if r.status_code != 200:
                    print("ERROR: fail to page\n " + url)
                    break
                pq = PyQuery(r.content)
                # NOTE(review): this selector differs from the first-page one
                # ("section.product-content", no trailing "li") -- kept as in
                # the original; verify against the live markup.
                for item in pq("section.product-content section.col-r"):
                    anchor = PyQuery(item)("a")[0]
                    description = PyQuery(item)("div.description")[0].text
                    lst.append({
                        'url': anchor.attrib['href'],
                        'name': anchor.text,
                        'description': description,
                    })
                    COUNT += 1
                PAGE += 1
            except Exception:
                # Best-effort: keep whatever was collected so far.
                print("ERROR: get more result fail.")
                break
        self.res[regin] = lst
        return lst
2.[代码][Python]代码
# TEST:a = AlexaTopSites()for i,e in enumerate(a.get_topSitesByRegin('CN',10)):print i+1,'\n',e['name'],'\n',e['url'],'\n',e['description']1 BaiduThe leading Chinese language search engine, provides "simple and reliable" search exp2 QqChina's largest and most used Internet service portal owned by Tencent, Inc founded in Nov3 TaobaoLaunched in May 2003, Taobao Marketplace (www.taobao.com) is the online shopping destination of4 Sina包括即日的国内外不同类型的新闻与评论,人物专题,图库。5 Hao123Baidu6 Weibo.com 新浪微博是全中国最主流,最具人气,当前最火爆的微博产品。用一句话随意记录生活,用手机随时随地发微博,迅速获取最热最火最快最酷最新的资讯。
编橙之家文章,
python获得 Alexa网站排名代码片段
1.[python获得 Alexa网站排名代码][Python]代码
# coding: utf-8
"""Scrape Alexa "top sites by country" rankings from www.alexa.com."""
import re

from pyquery import PyQuery
import requests
import numpy  # NOTE(review): appears unused in this snippet; kept since imports are file-level


class AlexaTopSites(object):
    """Fetch Alexa's per-region top-site listings.

    Results of ``get_topSitesByRegin`` are cached per region short code
    in ``self.res``.
    """

    def __init__(self):
        # Country name (upper-case) -> Alexa region short code (upper-case).
        self.reginShortNames = {}
        # True once reginShortNames has been scraped from the country index.
        self.tag_update_reginShortNames = False
        # Cache: region short code -> list of site dicts.
        self.res = {}

    @property
    def getRegin(self):
        """Return the country-name -> short-code map, fetching it on first use."""
        if not self.tag_update_reginShortNames:
            self.update_reginShortNames()
        return self.reginShortNames

    def update_reginShortNames(self):
        """Scrape the country index page and rebuild ``self.reginShortNames``."""
        url = "http://www.alexa.com/topsites/COUNTries"
        pq = PyQuery(url=url)
        d = {}
        # Capture the last path component of each country link, e.g. ".../CN" -> "CN".
        re_href = re.compile(r'(?:.*/)(\w+)$')
        for a in pq("div.categories.top").find("a"):
            # Guard against links whose href does not match (original crashed
            # with AttributeError on a non-matching href).
            m = re_href.match(a.attrib['href'])
            if m and m.group(1):
                d[a.text.upper()] = m.group(1).upper()
        self.reginShortNames = d
        self.tag_update_reginShortNames = True

    def get_topSitesByRegin(self, regin, nums=25):
        """Return up to ``nums`` top sites for region ``regin``.

        ``regin`` may be a region short code (e.g. ``'CN'``) or a country
        name known to ``reginShortNames``.  Each result entry is a dict
        with keys ``'url'``, ``'name'`` and ``'description'``.  Returns an
        empty list on failure (the original returned ``None``, which broke
        callers that iterate the result).
        """
        URL_BASE = "http://www.alexa.com/topsites/COUNTries/"
        URL_PAGES = "http://www.alexa.com/topsites/COUNTries;"
        PAGE = 1
        COUNT = 0
        regin = regin.upper()
        url = URL_BASE + regin
        try:
            r = requests.get(url=url)
            if not r.ok:
                # Not a short code; try translating a country name to one.
                if not self.tag_update_reginShortNames:
                    self.update_reginShortNames()
                regin = self.reginShortNames.get(regin)
                if regin is None:
                    # Original crashed here (URL_BASE + None) and fell into
                    # the bare except; fail explicitly instead.
                    print("ERROR: get url fail.")
                    return []
                url = URL_BASE + regin
                r = requests.get(url=url)
        except Exception:
            # Best-effort scraper: report and give up rather than propagate.
            print("ERROR: get url fail.")
            return []
        pq = PyQuery(r.content)
        lst = []
        for li in pq("section.page-product-content section.col-r li"):
            anchor = PyQuery(li)("a")[0]
            description = PyQuery(li)("div.description")[0].text
            lst.append({
                'url': anchor.attrib['href'],
                'name': anchor.text,
                'description': description,
            })
            COUNT += 1
            if COUNT >= nums:
                self.res[regin] = lst
                return lst
        # First page did not yield enough entries; walk the paginated listing.
        while COUNT < nums:
            try:
                url = URL_PAGES + str(PAGE) + "/" + regin
                r = requests.get(url=url)
                if r.status_code != 200:
                    print("ERROR: fail to page\n " + url)
                    break
                pq = PyQuery(r.content)
                # NOTE(review): this selector differs from the first-page one
                # ("section.product-content", no trailing "li") -- kept as in
                # the original; verify against the live markup.
                for item in pq("section.product-content section.col-r"):
                    anchor = PyQuery(item)("a")[0]
                    description = PyQuery(item)("div.description")[0].text
                    lst.append({
                        'url': anchor.attrib['href'],
                        'name': anchor.text,
                        'description': description,
                    })
                    COUNT += 1
                PAGE += 1
            except Exception:
                # Best-effort: keep whatever was collected so far.
                print("ERROR: get more result fail.")
                break
        self.res[regin] = lst
        return lst
2.[代码][Python]代码
# TEST:a = AlexaTopSites()for i,e in enumerate(a.get_topSitesByRegin('CN',10)):print i+1,'\n',e['name'],'\n',e['url'],'\n',e['description']1 BaiduThe leading Chinese language search engine, provides "simple and reliable" search exp2 QqChina's largest and most used Internet service portal owned by Tencent, Inc founded in Nov3 TaobaoLaunched in May 2003, Taobao Marketplace (www.taobao.com) is the online shopping destination of4 Sina包括即日的国内外不同类型的新闻与评论,人物专题,图库。5 Hao123Baidu6 Weibo.com 新浪微博是全中国最主流,最具人气,当前最火爆的微博产品。用一句话随意记录生活,用手机随时随地发微博,迅速获取最热最火最快最酷最新的资讯。
编橙之家文章,
相关内容
- 用Python实现最基本的电话本,Python实现电话,<用Pyth
- 用python代码科学上Google,python代码google,用python代码科学
- python终端播放音乐同步显示本地或网络歌词,python播放
- python获取情敌电脑内照片神器,,python获取情敌电脑
- python随机搜索并打开该路径内的文件,,标签: <pyt
- 基于python Selenium的用户登录自动化测试,pythonselenium,使
- 通过python hp ilo对机器的状态进行监控,pythonilo,<通过
- python深度搜索+命令模式 解数独,python解数,python深度搜
- 网友用python把IPv4地址变成LITNET-NAT64网段,ipv4litnet-nat
- python抓取百度音乐mp3歌曲,python抓取百度音乐,python抓取
评论关闭