Python.biqukan — https://www.biqukan.com


"""Scrape a novel from https://www.biqukan.com into MongoDB.

Pipeline (each step is a function in this module):

1. ``get_chapter_href`` downloads a book's index page and pushes one task per
   chapter link onto a Redis list.
2. ``get_chapter_content`` pops the tasks and hands them to a process pool.
3. ``get_chapter_content_`` (the worker) downloads one chapter and stores its
   text in MongoDB, or re-queues the task when the download came back empty.

A task is the string ``chapter_name|attempt_count|chapter_href``.
"""
import contextlib
import datetime
import multiprocessing

import pymongo
import redis
import requests
import requests.adapters
from lxml import etree  # noqa: F401  (unused here; kept from the original import list)
from pyquery import PyQuery as pq

redis_retries = 5  # maximum number of times a chapter task is re-queued
redis_key_chapter = 'redis_key_chapter'
mongo_db_name = 'mongo_db_name'
mongo_db_table = 'mongo_db_table'

_BASE_URL = 'https://www.biqukan.com'
_ENCODING = 'gbk'
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
_HTTP_RETRIES = 5
# Without a timeout requests waits indefinitely, which would stall a worker.
_HTTP_TIMEOUT = 30
_REDIS_HOST = '127.0.0.1'
_REDIS_PORT = 6379
_REDIS_DB = 0
_MONGO_ADDRESS = 'localhost:27017'


@contextlib.contextmanager
def _redis_client():
    """Yield a Redis client and drop its connections when the caller is done."""
    client = redis.StrictRedis(host=_REDIS_HOST, port=_REDIS_PORT, db=_REDIS_DB)
    try:
        yield client
    finally:
        client.connection_pool.disconnect()


@contextlib.contextmanager
def _mongo_collection():
    """Yield the target MongoDB collection and close the client afterwards.

    The client is created per use (rather than shared at module level) so a
    client opened in the parent is never reused in a forked worker process.
    """
    client = pymongo.MongoClient(_MONGO_ADDRESS)
    try:
        yield client[mongo_db_name][mongo_db_table]
    finally:
        client.close()


def _encode_task(chapter_name, attempts, chapter_href):
    """Serialise a chapter task as ``chapter_name|attempts|chapter_href``."""
    return '%s|%d|%s' % (chapter_name, attempts, chapter_href)


def _decode_task(redis_value):
    """Parse a task string into ``(chapter_name, attempts, chapter_href)``.

    Splitting from the right keeps chapter titles that themselves contain
    ``|`` intact. Raises ``ValueError`` when the value is malformed.
    """
    chapter_name, attempts, chapter_href = redis_value.rsplit('|', 2)
    return chapter_name, int(attempts), chapter_href


def get_url_txt(url, headers, encoding, data=None):
    """Fetch ``url`` with a GET request and return the decoded body.

    Args:
        url: Address to download.
        headers: HTTP headers to send.
        encoding: Character encoding used to decode the response body.
        data: Optional request body, forwarded to ``requests`` unchanged.

    Returns:
        The response text when the status code is 200, otherwise ``''``.
        Any exception is printed and also results in ``''``; callers rely on
        the empty string to detect a failed download.
    """
    try:
        with requests.Session() as session:
            # Assigning requests.adapters.DEFAULT_RETRIES at runtime has no
            # effect on new adapters, so the retry count is set explicitly.
            adapter = requests.adapters.HTTPAdapter(max_retries=_HTTP_RETRIES)
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            response = session.get(url, headers=headers, data=data, timeout=_HTTP_TIMEOUT)
            try:
                if response.status_code == 200:
                    response.encoding = encoding
                    return response.text
            finally:
                response.close()
    except Exception as e:
        print(e)
    return ''


def flush_redis_key_chapter():
    """Delete the Redis list that holds the pending chapter tasks."""
    try:
        with _redis_client() as client:
            client.delete(redis_key_chapter)
    except Exception as e:
        print(e)


def get_chapter_href(url):
    """Queue one task for every chapter link found on the index page ``url``.

    Links are taken from ``<a>`` elements inside ``.listmain``; anchors
    without an ``href`` are skipped. Errors are printed, not raised.
    """
    try:
        text = get_url_txt(url=url, headers=_HEADERS, encoding=_ENCODING)
        if not text:
            return
        listing = pq(text)('.listmain')
        tasks = []
        for anchor in pq(listing)('a'):
            link = pq(anchor)
            href = link.attr('href')
            if not href:
                continue
            tasks.append(_encode_task(link.text(), 0, _BASE_URL + href))
        if tasks:
            # A single RPUSH keeps the page order and needs one round trip.
            with _redis_client() as client:
                client.rpush(redis_key_chapter, *tasks)
    except Exception as e:
        print(e)


def flush_mongo_db_table():
    """Drop the MongoDB collection that stores the chapter contents."""
    try:
        with _mongo_collection() as collection:
            collection.drop()
    except Exception as e:
        print(e)


def get_chapter_content_(redis_value):
    """Download one chapter and store it, or re-queue the task on failure.

    Args:
        redis_value: Task string ``chapter_name|attempt_count|chapter_href``.

    On success a document ``{'chapter_name', 'content'}`` is inserted, where
    ``chapter_name`` is prefixed with the page id taken from the URL's file
    name. When the download is empty the task is pushed back with its attempt
    count incremented, as long as the count is below ``redis_retries``.
    Errors are printed, not raised.
    """
    try:
        chapter_name, attempts, chapter_href = _decode_task(redis_value)
        index = chapter_href.split('/')[-1].split('.')[0]
        text = get_url_txt(url=chapter_href, headers=_HEADERS, encoding=_ENCODING)
        if text:
            print(redis_value)
            content = pq(pq(text)('#content')).text()
            with _mongo_collection() as collection:
                collection.insert_one(
                    {'chapter_name': index + chapter_name, 'content': content})
        elif attempts < redis_retries:
            with _redis_client() as client:
                client.rpush(
                    redis_key_chapter,
                    _encode_task(chapter_name, attempts + 1, chapter_href))
    except Exception as e:
        print(e)


def get_chapter_content():
    """Process queued chapter tasks in a process pool until the queue is empty.

    Each round drains the Redis list into a fresh pool and waits for it to
    finish; tasks re-queued by workers are picked up by the same or the next
    round. Errors are printed, not raised.
    """
    try:
        with _redis_client() as client:
            while client.llen(redis_key_chapter) > 0:
                pool = multiprocessing.Pool()
                try:
                    while True:
                        redis_value = client.lpop(redis_key_chapter)
                        if redis_value is None:
                            break
                        redis_value = redis_value.decode(encoding='utf8', errors='ignore')
                        pool.apply_async(get_chapter_content_, (redis_value,))
                finally:
                    # Always wait for the workers, even if queuing failed.
                    pool.close()
                    pool.join()
    except Exception as e:
        print(e)


def main():
    """Print start/end timestamps and the elapsed whole seconds.

    The scraping steps are disabled by default; uncomment them to run the
    pipeline in order.
    """
    start = datetime.datetime.now()
    print(start.strftime('%Y-%m-%d %H:%M:%S'))

    # flush_redis_key_chapter()
    # get_chapter_href('https://www.biqukan.com/4_4438/')
    # flush_mongo_db_table()
    # get_chapter_content()
    #
    # Export the stored chapters to a text file, ordered by chapter_name:
    # with open('temp.txt', 'w') as f:
    #     with _mongo_collection() as collection:
    #         for i in collection.find({}).sort([('chapter_name', pymongo.ASCENDING)]):
    #             f.write('\t' + i['chapter_name'] + '\n')
    #             f.write(i['content'].replace('\xa0', '').replace('\n\n', '\n') + '\n\n')

    end = datetime.datetime.now()
    print(end.strftime('%Y-%m-%d %H:%M:%S'))
    # total_seconds() also counts whole days, which timedelta.seconds drops.
    print('cost seconds : %d' % (end - start).total_seconds())


if __name__ == '__main__':
    main()

Python.biqukan

评论关闭