简单的电子邮件爬虫Python代码,邮件爬虫python代码,import reque


import requestsimport retry:    from urllib.parse import urljoinexcept ImportError:    from urlparse import urljoin# regexemail_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')link_re = re.compile(r'href="(.*?)"')def crawl(url):    result = set()    req = requests.get(url)    # Check if successful    if(req.status_code != 200):        return []    # Find links    links = link_re.findall(req.text)    print("\nFound {} links".format(len(links)))    # Search links for emails    for link in links:        # Get an absolute URL for a link        link = urljoin(url, link)        # Find all emails on current page        result.update(email_re.findall(req.text))    return resultif __name__ == '__main__':    emails = crawl('http://www.realpython.com')    print("\nScrapped e-mail addresses:")    for email in emails:        print(email)    print("\n")

评论关闭