Python-爬虫-针对有frame框架的页面


有的页面会使用frame 框架,使用Selenium + PhantomJS 后并不会加载iframe 框架中的网页内容。iframe 框架相当于在页面中又加载了一个页面,需要使用Selenium 的 switch_to.frame() 方法加载(官网给的方法是switch_to_frame(),但是IDE提醒使用前面的方法替代该方法)。

比如:

driver.switch_to.frame('g_iframe')

html = driver.page_source

然后结合BeautifulSoup获取网页中信息。

这次我们爬取http://music.163.com/#/artist/album?id=101988&limit=120&offset=0页面中的专辑信息,比如,图片、网址及专辑名字。

技术分享
"""Scrape album info (cover image, album name, release date) from
http://music.163.com/#/artist/album?id=101988&limit=120&offset=0
"""
from selenium import webdriver
from urllib.request import urlretrieve
import os
from bs4 import BeautifulSoup


class DownloadInfo():
    """Download album covers and print album names/dates for a NetEase
    Music artist page whose content is rendered inside an iframe."""

    def __init__(self):
        # Artist album listing page; the real content lives in an iframe.
        self.url = 'http://music.163.com/#/artist/album?id=101988&limit=120&offset=0'
        self.basePath = os.path.dirname(__file__)

    def makedir(self, name):
        """Create directory `name` under basePath if missing, chdir into it,
        and return its path (urlretrieve then saves files there)."""
        path = os.path.join(self.basePath, name)
        if not os.path.exists(path):
            os.makedirs(path)
            print('The file is created now.')
        else:
            print('The file existed.')
        # 切换到该目录下 -- switch into the directory so downloads land there
        os.chdir(path)
        return path

    def connect(self, url):
        """Open `url` in a headless PhantomJS browser and return the driver.

        NOTE(review): PhantomJS is deprecated in recent Selenium releases;
        headless Chrome/Firefox is the modern replacement.
        """
        driver = webdriver.PhantomJS()
        driver.get(url)
        print('success')
        return driver

    def getFileNames(self, path):
        """Return the file names already present in `path` (used to skip
        re-downloading covers)."""
        return os.listdir(path)

    def getInfo(self):
        """Download each album cover (skipping ones already on disk) and
        print every album title and release date."""
        driver = self.connect(self.url)
        # The album list is inside an iframe; without switching into it,
        # page_source / element lookups would miss the actual content.
        driver.switch_to.frame('g_iframe')
        path = self.makedir('Infos')
        pic_names = self.getFileNames(path)
        imgs = driver.find_elements_by_xpath("//div[@class='u-cover u-cover-alb3']/img")
        titles = driver.find_elements_by_xpath("//li/p[@class='dec dec-1 f-thide2 f-pre']/a")
        dates = driver.find_elements_by_xpath("//span[@class='s-fc3']")
        # Pair each cover image with its album title instead of keeping a
        # manual counter into `titles`.
        for img, title in zip(imgs, titles):
            album_name = title.text
            # '/' is illegal in file names; strip it before saving.
            photo_name = album_name.replace('/', '') + '.jpg'
            print(photo_name)
            if photo_name in pic_names:
                print('图片已下载。')
            else:
                urlretrieve(img.get_attribute('src'), photo_name)
        for title in titles:
            print(title.text)
        for date in dates:
            print(date.text)

    # Alternative implementation kept from the article: parse the iframe's
    # page_source with BeautifulSoup instead of Selenium element lookups.
    """
    def getInfo(self):
        driver = self.connect(self.url)
        driver.switch_to.frame('g_iframe')
        html = driver.page_source
        path = self.makedir('Infos')
        pic_names = self.getFileNames(path)
        all_li = BeautifulSoup(html, 'lxml').find(id='m-song-module').find_all('li')
        for li in all_li:
            album_img = li.find('img')['src']
            album_name = li.find('p', class_='dec')['title']
            album_date = li.find('span', class_='s-fc3').get_text()
            print(album_img)
            print(album_name)
            print(album_date)
            photo_name = album_name.replace('/', '') + '.jpg'
            if photo_name in pic_names:
                print('图片已下载。')
            else:
                urlretrieve(album_img, photo_name)
    """


if __name__ == '__main__':
    obj = DownloadInfo()
    obj.getInfo()
View Code

Python-爬虫-针对有frame框架的页面

评论关闭