Python爪巴虫


# BeautifulSoup demo: download a sample page, then extract tags, links,
# class-filtered list items, and regex-filtered images/links from it.
import re
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Close the HTTP response as soon as the body has been read.
with urlopen("https://morvanzhou.github.io/static/scraping/table.html") as response:
    html = response.read().decode('utf-8')
# print(html)

soup = BeautifulSoup(html, features='lxml')

print(soup.h1)
# sample output: <h1>标题</h1>
print(soup.p)
# sample output: <p>段落</p>

# Collect every link on the page: find all <a> tags, then read their href.
all_href = soup.find_all('a')
all_href = [anchor['href'] for anchor in all_href]
print('\n', all_href)

# Select elements by CSS class.
month = soup.find_all('li', {"class": "month"})
for m in month:
    print(m)
    # sample output: <li class="month">XXX</li>
    print(m.get_text())
    # sample output: XXX

# Filter images with a regular expression on the src attribute.
# The pattern is unanchored, so it accepts any src that contains ".jpg".
img_links = soup.find_all("img", {"src": re.compile(r'.*?\.jpg')})
print(img_links)
# sample output:
# [<img src="https://morvanzhou.github.io/static/img/course_cover/tf.jpg"/>]
for link in img_links:
    print(link['src'])
# sample output: https://morvanzhou.github.io/static/img/course_cover/tf.jpg

# Filter links with a regular expression on the href attribute.
course_links = soup.find_all('a', {'href': re.compile(r'https://morvan.*')})
print(course_links)
# sample output: [<a href="https://morvanzhou.github.io/">莫烦 Python</a>]
for link in course_links:
    print(link['href'])
# sample output: https://morvanzhou.github.io/
# requests demo: GET with query parameters, POST form data, file upload,
# and a cookie-preserving session.
import webbrowser

import requests

# GET: the params dict is encoded into the URL's query string.
param = {"wd": "莫烦Python"}  # search keywords
r = requests.get('http://www.baidu.com/s', params=param)
print(r.url)
# sample output: http://www.baidu.com/s?wd=%E8%8E%AB%E7%83%A6Python
webbrowser.open(r.url)

# POST: submit form fields.
data = {'firstname': '莫烦', 'lastname': '周'}  # form data to submit
r = requests.post('http://pythonscraping.com/files/processing.php', data=data)
print(r.text)
# sample output: Hello there, 莫烦 周!

# Upload an image. The file is opened in a `with` block so the handle is
# closed once the request has been sent.
with open('./image.png', 'rb') as image:
    file = {'uploadFile': image}
    r = requests.post(
        'http://pythonscraping.com/files/processing2.php', files=file)
print(r.text)
# sample output: The file image.png has been uploaded.

# Session login: the Session object keeps cookies between requests.
session = requests.Session()
payload = {'username': 'Morvan', 'password': 'password'}
r = session.post(
    'http://pythonscraping.com/pages/cookies/welcome.php', data=payload)
print(r.cookies.get_dict())
# sample output: {'username': 'Morvan', 'loggedin': '1'}
r = session.get("http://pythonscraping.com/pages/cookies/profile.php")
print(r.text)
# sample output: Hey Morvan! Looks like you're still logged into the site!

Python爪巴虫

评论关闭