Python采集百度地图数据,python采集地图,百度利用其强大的中文搜索


百度利用其强大的中文搜索引擎数据,结合地图应用,包含了海量的公司联系方式,比Google要强,更别说什么黄页网站了。因为一些业务需要,写了这个行业公司地址采集程序,使用方便,直接运行,支持命令行设定查询参数。使用方法:把代码保存成 bmap.py,然后运行 `python bmap.py` 或 `python bmap.py 服饰厂`。运行后会自动采集百度地图中所有的结果,保存为以 tab 分隔的 txt 文件,方便导入各种数据库。

bmap.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2012 Channing Wong
#
# @mail: channing.wong@yahoo.com
# @home: http://blog.3363.me/
# @date: Mar 3, 2012
#
"""Collect Baidu Map search results for a keyword into a tab-separated file.

Usage::

    python bmap.py            # search for the built-in default keyword
    python bmap.py KEYWORD    # search for KEYWORD

Results are written to ``KEYWORD.txt`` in the current directory, one line per
listing with four tab-separated columns: city name, listing name, address,
telephone.

The module runs on both Python 2 and Python 3.
"""

import contextlib
import io
import json
import sys
import time

try:  # Python 3
    from urllib.parse import urlencode
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlencode, urlopen

if sys.version_info[0] == 2:
    # Python 2 only: the original script switched the implicit str/unicode
    # codec to UTF-8; keep that so mixed byte/unicode formatting still works.
    reload(sys)  # noqa: F821 - ``reload`` is a builtin on Python 2
    sys.setdefaultencoding('utf-8')

# Number of listings per result page assumed by the page-count arithmetic.
PAGE_SIZE = 10


class BaiduMap(object):
    """Scrape every Baidu Map listing that matches a search keyword.

    Constructing an instance performs one request to discover the cities that
    have results (``self.city``) and the overall result count
    (``self.total_num``), then opens ``KEYWORD.txt`` for writing.  Call
    :meth:`get_all` to download and save the listings.
    """

    def __init__(self, keyword):
        """Look up the cities with results for *keyword* and open the output.

        Exits the process with status 1 if the service does not return a
        list of cities for the keyword.
        """
        self.keyword = keyword
        # Query for the initial city-overview search.  The parameter values
        # are taken verbatim from the original script; their individual
        # meanings are not documented here.
        self.query = [
            ('b', '(-1599062.039999999,811604.75;24779177.96,8168020.75)'),
            ('c', '1'),
            ('from', 'webmap'),
            ('ie', 'utf-8'),
            ('l', '4'),
            ('newmap', '1'),
            ('qt', 's'),
            ('src', '0'),
            ('sug', '0'),
            ('t', int(time.time())),
            ('tn', 'B_NORMAL_MAP'),
            ('wd', keyword),
            ('wd2', ''),
        ]
        self.mapurl = 'http://map.baidu.com/'
        self.file = None
        self.count = 0      # listings saved so far, across all cities
        self.count_c = 0    # listings saved so far for the current city
        self.total_num = 0  # sum of the per-city result counts
        self.city = []
        self._get_city()
        # Open the output only after the lookup succeeded, so a failed
        # lookup neither leaks a handle nor leaves an empty file behind.
        self.file = io.open('%s.txt' % keyword, 'w', encoding='utf-8')

    @staticmethod
    def _page_count(num):
        """Return how many pages of PAGE_SIZE are needed for *num* results."""
        # Ceiling division that stays an int on both Python 2 and 3.
        return -(-num // PAGE_SIZE)

    def _fetch(self, query=None, json=True):
        """Request ``self.mapurl`` with *query* and return the response.

        :param query: sequence of ``(name, value)`` pairs; ``None`` sends an
            empty query string.
        :param json: when true, return the decoded JSON document (or ``None``
            if the body is not valid JSON); otherwise return the raw body.
        """
        url = self.mapurl + '?' + urlencode(query or [])
        # ``closing`` releases the connection even if ``read`` raises.
        with contextlib.closing(urlopen(url)) as response:
            data = response.read()
        if json:
            return self._tojson(data)
        return data

    def _tojson(self, data):
        """Decode *data* (UTF-8 bytes or text) as JSON; ``None`` on failure."""
        try:
            if isinstance(data, bytes):
                data = data.decode('utf-8')
            return json.loads(data)
        except (TypeError, ValueError):
            # ValueError covers both malformed JSON and invalid UTF-8.
            return None

    def _get_city(self):
        """Populate ``self.city`` and ``self.total_num`` from the first search.

        Exits with status 1 when the response has no list under ``content``.
        """
        data = self._fetch(self.query)
        content = data.get('content') if isinstance(data, dict) else None
        if not isinstance(content, list):
            print('keyword error.')
            sys.exit(1)
        self.city = content
        # Further cities, when present, arrive grouped under 'more_city'.
        for group in data.get('more_city') or []:
            self.city.extend(group['city'])
        for city in self.city:
            self.total_num += city['num']

    def _get_data(self, city, page=0):
        """Fetch result page *page* (zero-based) for *city*.

        *city* is one of the entries in ``self.city``; its ``geo`` and
        ``code`` fields are used to build the request.  Returns the decoded
        JSON document, or ``None`` if the response is not valid JSON.
        """
        query = [
            ('addr', '0'),
            # The second '|'-separated field of 'geo' is sent as the bounds.
            ('b', '(%s)' % city['geo'].split('|')[1]),
            ('c', city['code']),
            ('db', '0'),
            ('gr', '3'),
            ('ie', 'utf-8'),
            ('l', '9'),
            ('newmap', '1'),
            ('on_gel', '1'),
            ('pn', page),
            ('qt', 'con'),
            ('src', '7'),
            ('sug', '0'),
            ('t', int(time.time())),
            ('tn', 'B_NORMAL_MAP'),
            ('wd', self.keyword),
            ('wd2', ''),
        ]
        return self._fetch(query)

    def _save(self, content, city):
        """Append each listing in *content* to the output and print progress.

        Missing ``addr`` or ``tel`` fields are written as empty columns.
        """
        for item in content:
            self.count += 1
            self.count_c += 1
            # The unicode template keeps the line text-typed on Python 2,
            # which the io.open() text file requires.
            line = u'%s\t%s\t%s\t%s\n' % (
                city['name'],
                item['name'],
                item.get('addr', ''),
                item.get('tel', ''),
            )
            self.file.write(line)
            print('(%s/%s) %s[%s/%s]' % (
                self.count, self.total_num, city['name'],
                self.count_c, city['num']))

    def get(self, city):
        """Download and save every result page for *city*.

        Pages whose response is not valid JSON, or that carry no list under
        ``content``, are skipped.
        """
        self.count_c = 0
        for page in range(self._page_count(city['num'])):
            data = self._get_data(city, page)
            content = data.get('content') if isinstance(data, dict) else None
            if isinstance(content, list):
                self._save(content, city)

    def get_all(self):
        """Download every city in ``self.city``, then close the output file."""
        try:
            for city in self.city:
                self.get(city)
        finally:
            # Close the file even when a request fails part-way through.
            self.file.close()


def main(argv=None):
    """Run the scraper; the keyword is ``argv[1]`` or a built-in default."""
    argv = sys.argv if argv is None else argv
    keyword = argv[1] if len(argv) > 1 else '钻石'
    baidumap = BaiduMap(keyword)
    print('_' * 20)
    print('CITY: %s' % len(baidumap.city))
    print('DATA: %s' % baidumap.total_num)
    baidumap.get_all()


if __name__ == '__main__':
    main()

评论关闭