# Python.biqukan -- chapter scraper for https://www.biqukan.com
"""Scrape the chapters of a novel hosted on https://www.biqukan.com.

Workflow (see ``main``):

1. ``get_chapter_href`` reads a book's index page and queues one entry per
   chapter link in a Redis list, formatted ``chapter_name|attempts|href``.
2. ``get_chapter_content`` drains that list with a ``multiprocessing.Pool``;
   every worker (``get_chapter_content_``) downloads one chapter and stores
   its text in MongoDB, or re-queues the entry when the download failed.
"""
import datetime
import multiprocessing

import pymongo
import redis
import requests
import requests.adapters
from lxml import etree
from pyquery import PyQuery as pq

redis_retries = 5  # maximum number of download attempts per chapter
redis_key_chapter = 'redis_key_chapter'
mongo_db_name = 'mongo_db_name'
mongo_db_table = 'mongo_db_table'

_SITE_ROOT = 'https://www.biqukan.com'
_ENCODING = 'gbk'
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
_HTTP_RETRIES = 5  # connection-level retries performed by the HTTP adapter
_MONGO_ADDRESS = 'localhost:27017'


def _redis_client():
    """Return a Redis client for the local server (127.0.0.1:6379, db 0)."""
    return redis.StrictRedis(
        connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0))


def get_url_txt(url, headers, encoding, data=None, timeout=30):
    """Download ``url`` with a GET request and return the decoded body.

    Args:
        url: Address to fetch.
        headers: HTTP headers sent with the request.
        encoding: Text encoding used to decode the response body.
        data: Optional request body; only sent when it is not ``None``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the status code is 200, otherwise ``''``.
        Any exception is printed and also results in ``''``, so callers can
        treat an empty string as "download failed".
    """
    ret = ''
    try:
        # The context managers close the response and the session even when
        # the request raises.
        with requests.Session() as session:
            # Retries are configured on a mounted transport adapter.
            adapter = requests.adapters.HTTPAdapter(max_retries=_HTTP_RETRIES)
            session.mount('http://', adapter)
            session.mount('https://', adapter)

            request_kwargs = {'headers': headers, 'timeout': timeout}
            if data is not None:
                request_kwargs['data'] = data

            with session.get(url, **request_kwargs) as response:
                if response.status_code == 200:
                    response.encoding = encoding
                    ret = response.text
    except Exception as e:
        # Deliberately best-effort: callers rely on '' to trigger a retry.
        print(e)
    return ret


def flush_redis_key_chapter():
    """Delete the Redis list that holds the pending chapter entries."""
    try:
        _redis_client().delete(redis_key_chapter)
    except Exception as e:
        print(e)


def get_chapter_href(url):
    """Queue every chapter link found on the book index page ``url``.

    Each anchor inside the ``.listmain`` element becomes one Redis list entry
    ``chapter_name|0|absolute_href`` (the middle field is the attempt count).
    Anchors without an ``href`` attribute are skipped. Errors are printed.
    """
    try:
        text = get_url_txt(url=url, headers=_HEADERS, encoding=_ENCODING)
        if not text:
            return

        listing = pq(text)('.listmain')
        entries = []
        for anchor in pq(listing)('a'):
            link = pq(anchor)
            href = link.attr('href')
            if not href:
                # Without this guard a single href-less anchor would raise
                # and abort the whole listing.
                continue
            # chaptername|count|href
            entries.append(link.text() + '|0|' + _SITE_ROOT + href)

        if entries:
            # One client and one RPUSH for the whole listing.
            _redis_client().rpush(redis_key_chapter, *entries)
    except Exception as e:
        print(e)


def flush_mongo_db_table():
    """Drop the MongoDB collection that stores downloaded chapters."""
    try:
        with pymongo.MongoClient(_MONGO_ADDRESS) as client:
            client[mongo_db_name][mongo_db_table].drop()
    except Exception as e:
        print(e)


def get_chapter_content_(redis_value):
    """Download one queued chapter and store it in MongoDB.

    Args:
        redis_value: Queue entry of the form ``chapter_name|attempts|href``.

    On success a document ``{'chapter_name': <page id + name>, 'content':
    <text of #content>}`` is inserted. When the download returns nothing and
    fewer than ``redis_retries`` attempts were made, the entry is pushed back
    onto the Redis list with the attempt count incremented. Errors are
    printed.
    """
    try:
        # Split from the right so a '|' inside the chapter title cannot shift
        # the attempt-count and href fields.
        chapter_name, attempts, chapter_href = redis_value.rsplit('|', 2)
        conn_count = int(attempts)
        # File name of the chapter page without its extension; it is
        # prefixed to the stored chapter name.
        index = chapter_href.rsplit('/', 1)[-1].split('.')[0]

        text = get_url_txt(url=chapter_href, headers=_HEADERS, encoding=_ENCODING)
        if text:
            print(redis_value)
            content = pq(pq(text)('#content')).text()
            with pymongo.MongoClient(_MONGO_ADDRESS) as client:
                client[mongo_db_name][mongo_db_table].insert_one(
                    {'chapter_name': index + chapter_name, 'content': content})
        elif conn_count < redis_retries:
            # chaptername|count|href
            _redis_client().rpush(
                redis_key_chapter,
                chapter_name + '|' + str(conn_count + 1) + '|' + chapter_href)
    except Exception as e:
        print(e)


def get_chapter_content():
    """Process queued chapters in parallel until the Redis list is empty.

    Each outer iteration creates a process pool, hands every entry popped
    from the list to ``get_chapter_content_`` and waits for the workers.
    The outer loop repeats because workers may push failed entries back.
    Errors are printed.
    """
    try:
        client = _redis_client()
        while client.llen(redis_key_chapter) > 0:
            pool = multiprocessing.Pool()
            try:
                while True:
                    redis_value = client.lpop(redis_key_chapter)
                    if redis_value is None:
                        break
                    redis_value = redis_value.decode(encoding='utf8', errors='ignore')
                    pool.apply_async(get_chapter_content_, (redis_value,))
                pool.close()
                pool.join()
            finally:
                # Harmless after a normal join; stops the workers if the
                # dispatch loop raised.
                pool.terminate()
    except Exception as e:
        print(e)


def main():
    """Print start/end timestamps and the elapsed time around the pipeline.

    All pipeline steps are disabled by default; uncomment the ones to run.
    """
    start = datetime.datetime.now()
    print(start.strftime('%Y-%m-%d %H:%M:%S'))

    # Step 1: rebuild the chapter queue from the book index page.
    # flush_redis_key_chapter()
    # get_chapter_href('https://www.biqukan.com/4_4438/')

    # Step 2: download every queued chapter into MongoDB.
    # flush_mongo_db_table()
    # get_chapter_content()

    # Step 3: export the stored chapters, sorted by name, to a text file.
    # with open('temp.txt', 'w'):
    #     pass
    # with open('temp.txt', 'a') as f:
    #     for i in pymongo.MongoClient('localhost:27017')[mongo_db_name][mongo_db_table].find({}).sort(
    #             [('chapter_name', pymongo.ASCENDING)]):
    #         f.write('\t' + i['chapter_name'] + '\n')
    #         f.write(i['content'].replace('\xa0', '').replace('\n\n', '\n') + '\n\n')

    end = datetime.datetime.now()
    print(end.strftime('%Y-%m-%d %H:%M:%S'))
    # total_seconds() includes whole days, which timedelta.seconds drops.
    print('cost seconds : %d' % (end - start).total_seconds())


if __name__ == '__main__':
    main()
# Python.biqukan
# Comments closed.