Python 获取 Alexa 网站排名(关键词:python, alexa)
<python获得 Alexa网站排名>

Python 获取 Alexa 网站排名的代码片段

1. Python 获取 Alexa 网站排名代码(Python 代码)

#coding:utf-8
import re
from pyquery import PyQuery
import requests
import numpy


class AlexaTopSites(object):
    """Scrape the "top sites by country" listings from alexa.com.

    Attributes:
        reginShortNames: dict mapping the upper-cased link text found on the
            countries index page to the upper-cased last path segment of that
            link's href. Filled lazily by ``update_reginShortNames``.
        tag_update_reginShortNames: True once ``reginShortNames`` was loaded.
        res: dict of the result lists fetched so far, keyed by region code.
    """

    # CSS selector matching one site entry. The same selector is used for the
    # first page and for every follow-up page.
    _SITE_SELECTOR = "section.page-product-content section.col-r li"

    def __init__(self):
        self.reginShortNames = {}
        self.tag_update_reginShortNames = False
        self.res = {}

    @property
    def getRegin(self):
        """Return the region-name -> short-code map, loading it on first use."""
        if not self.tag_update_reginShortNames:
            self.update_reginShortNames()
        return self.reginShortNames

    def update_reginShortNames(self):
        """Download the countries index page and rebuild ``reginShortNames``."""
        # NOTE(review): the mixed-case "COUNTries" path segment is kept exactly
        # as found; it looks like a search/replace artifact -- confirm.
        url = "http://www.alexa.com/topsites/COUNTries"
        pq = PyQuery(url=url)
        d = {}
        re_href = re.compile(r'(?:.*/)(\w+)$')
        for link in pq("div.categories.top").find("a"):
            matched = re_href.match(link.attrib.get('href', ''))
            # Skip anchors without a usable href or without link text instead
            # of failing on ``None.group`` / ``None.upper``.
            if matched is None or not link.text:
                continue
            d[link.text.upper()] = matched.group(1).upper()
        self.reginShortNames = d
        self.tag_update_reginShortNames = True

    def _parse_sites(self, html):
        """Return the site entries found in *html* as a list of dicts."""
        sites = []
        for node in PyQuery(html)(self._SITE_SELECTOR):
            item = PyQuery(node)
            anchors = item("a")
            if not anchors:
                # A list item without a link is not a site entry.
                continue
            descriptions = item("div.description")
            sites.append({
                'url': anchors[0].attrib.get('href'),
                'name': anchors[0].text,
                'description': descriptions[0].text if descriptions else None,
            })
        return sites

    def get_topSitesByRegin(self, regin, nums=25):
        """Fetch up to *nums* top sites for a region.

        Args:
            regin: region short code (e.g. ``'CN'``). If the request for that
                code is not OK, the value is looked up as a region name in
                ``reginShortNames`` and the request is retried with the
                resulting short code.
            nums: maximum number of entries to return.

        Returns:
            A list of dicts with the keys ``'url'``, ``'name'`` and
            ``'description'`` (at most *nums* of them), or ``None`` when the
            first page could not be requested. The list is also stored in
            ``self.res`` under the region code that was finally used.
        """
        URL_BASE = "http://www.alexa.com/topsites/COUNTries/"
        URL_PAGES = "http://www.alexa.com/topsites/COUNTries;"
        regin = regin.upper()
        try:
            r = requests.get(url=URL_BASE + regin)
            if not r.ok:
                if not self.tag_update_reginShortNames:
                    self.update_reginShortNames()
                short_name = self.reginShortNames.get(regin)
                if short_name is None:
                    # Unknown region: there is nothing to retry with.
                    print("ERROR: get url fail.")
                    return
                regin = short_name
                r = requests.get(url=URL_BASE + regin)
        except Exception:
            # Best effort: both the request and the index-page download may
            # fail in library-specific ways; report and give up.
            print("ERROR: get url fail.")
            return

        lst = self._parse_sites(r.content)[:nums]

        # Follow-up pages are numbered from 1 (the first page has no number).
        page = 1
        while len(lst) < nums:
            url = URL_PAGES + str(page) + "/" + regin
            try:
                r = requests.get(url=url)
                if r.status_code != 200:
                    print("ERROR: fail to page\n  " + url)
                    break
                more = self._parse_sites(r.content)
            except Exception:
                print("ERROR: get more result fail.")
                break
            if not more:
                # An empty page means there is nothing left; stop instead of
                # requesting further pages forever.
                break
            lst.extend(more[:nums - len(lst)])
            page += 1

        self.res[regin] = lst
        return lst

2. 测试代码(Python 代码)

# TEST: print the top 10 sites for region 'CN', one field per line.
a = AlexaTopSites()
# get_topSitesByRegin returns None when the first request fails, so fall back
# to an empty list rather than passing None to enumerate().
for i, e in enumerate(a.get_topSitesByRegin('CN', 10) or [], 1):
    print("%d\n%s\n%s\n%s" % (i, e['name'], e['url'], e['description']))

# Sample output as captured in the original post (fields run together,
# descriptions truncated, list cut off after the sixth entry):
# 1 BaiduThe leading Chinese language search engine, provides "simple and reliable" search exp
# 2 QqChina's largest and most used Internet service portal owned by Tencent, Inc founded in Nov
# 3 TaobaoLaunched in May 2003, Taobao Marketplace (www.taobao.com) is the online shopping destination of
# 4 Sina包括即日的国内外不同类型的新闻与评论,人物专题,图库。
# 5 Hao123Baidu
# 6 Weibo.com 新浪微博是全中国最主流,最具人气,当前最火爆的微博产品。用一句话随意记录生活,用手机随时随地发微博,迅速获取最热最火最快最酷最新的资讯。

编橙之家文章,

评论关闭