OSCer 都在干嘛?


import lxml
import requests
from lxml import etree

# Browser-like User-Agent sent with every request.
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.2; WOW64; rv:37.0) '
              'Gecko/20100101 Firefox/37.0')

# Seconds to wait for the server before giving up; without a timeout
# ``requests.get`` can block indefinitely.
DEFAULT_TIMEOUT = 10


def get_content(url, timeout=DEFAULT_TIMEOUT):
    """Download *url* and return its body wrapped in a single ``<html>`` root.

    The response body is decoded as UTF-8 and wrapped in ``<html>...</html>``
    so that it has exactly one root element when parsed as XML.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server (passed to ``requests.get``).

    Returns:
        The wrapped document as a string, or ``None`` if the request failed,
        the server answered with an HTTP error status, or the body is not
        valid UTF-8.
    """
    headers = {'User-Agent': USER_AGENT}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # An error page is not a tweet list; treat it as "no content".
        response.raise_for_status()
        body = response.content.decode('utf-8')
    except (requests.RequestException, UnicodeDecodeError):
        return None
    return '<html>' + body + '</html>'


class User(object):
    """Plain record for one parsed tweet entry.

    Every attribute defaults to ``None`` and is filled in by
    ``parse_content``.
    """

    def __init__(self):
        self.name = None       # first text node of the first <p>
        self.txt = None        # second text node of the first <p>
        self.data = None       # first text node of the second <p>
        self.commtents = None  # second text node (historical spelling, kept
                               # so existing readers of the attribute work)
        self.client = None     # third text node of the second <p>
        self.link = None       # href of the first <a> in the second <p>

    @property
    def comments(self):
        """Correctly spelled read alias for ``commtents``."""
        return self.commtents

    def __repr__(self):
        return '%s(name=%r, txt=%r, link=%r)' % (
            type(self).__name__, self.name, self.txt, self.link)


def parse_content(content):
    """Parse a document produced by ``get_content`` into ``User`` records.

    Each ``li/div`` element under the root that contains at least two ``<p>``
    children yields one ``User``. Text fields are only filled in when the
    paragraph has exactly the expected number of text nodes; otherwise they
    stay ``None``.

    Args:
        content: Wrapped document string, or ``None``.

    Returns:
        A list of ``User`` objects (empty when *content* is ``None``).

    Raises:
        lxml.etree.XMLSyntaxError: If *content* is not well-formed XML.
    """
    if content is None:
        return []

    root = etree.fromstring(content)
    page_info = []
    for div in root.findall('li/div'):
        paragraphs = div.findall('p')
        if len(paragraphs) < 2:
            continue
        head, meta = paragraphs[0], paragraphs[1]

        user = User()

        head_texts = list(head.itertext())
        if len(head_texts) == 2:
            user.name, user.txt = head_texts

        meta_texts = list(meta.itertext())
        if len(meta_texts) == 3:
            user.data, user.commtents, user.client = meta_texts

        # The metadata paragraph may have no anchor at all; indexing the
        # first match unconditionally would raise IndexError.
        anchor = meta.find('a')
        user.link = anchor.get('href') if anchor is not None else None

        page_info.append(user)
    return page_info


def _client_label(client):
    """Return the stripped second line of *client*, tolerating odd input.

    Falsy values are returned unchanged; text without a newline is returned
    stripped instead of raising IndexError.
    """
    if not client:
        return client
    lines = client.split('\n')
    if len(lines) > 1:
        return lines[1].strip()
    return client.strip()


def test():
    """Fetch one page of tweets and print name, text and client for each."""
    url = "http://www.oschina.net/fetch_tweets?p=40"
    content = get_content(url)
    infos = parse_content(content)
    for u in infos:
        print(u.name)
        print(u.txt)
        print(_client_label(u.client))


if __name__ == '__main__':
    test()

评论关闭