PyQuery:文本统计的通用类,实现类SQL接口,pyquerysql,被引用次数最多的10篇论文
PyQuery:文本统计的通用类,实现类SQL接口,pyquerysql,被引用次数最多的10篇论文
被引用次数最多的10篇论文
# -*- coding: utf-8 -*-
"""Report the 100 most-cited papers, joined with each paper's year and country.

The script counts how often every paper appears in the CITED column of the
citation file, keeps the 100 largest counts, left-joins them with the paper
detail file on CITED = PATENT and writes the result to a file.
"""
from PyQuery import PyQuery
import time

# Input 1: cite75_99 -- one citation record per line, described by the
# original author as "<citing paper>\t<cited paper>" (roughly 7 million
# records, going by the input file name -- TODO confirm).  Sample:
#   3858241,956203
#   3858241,1324234
#   3858241,3634889
#   3858242,1515701
#   3858242,3319261
#   ......
# NOTE(review): the sample rows above are comma-separated while the script
# reads the file with a tab delimiter -- confirm the real file's separator.
#
# Input 2: apat63_99.txt -- one line of detailed information per paper
# (roughly 3 million records -- TODO confirm).  It is left-joined in order to
# look up the year and country of each paper.  Sample:
#   3070801,1963,1096,,"BE","",,1,,269,6,69,,1,,0,,,,,,,
#   3070802,1963,1096,,"US","TX",,1,,2,6,63,,0,,,,,,,,,
#   3070803,1963,1096,,"US","IL",,1,,2,6,63,,9,,0.3704,,,,,,,
#   3070804,1963,1096,,"US","OH",,1,,2,6,63,,3,,0.6667,,,,,,,
#   ......

if __name__ == "__main__":
    start = time.time()

    # Left side: the 100 most frequently cited papers.
    cite_columns = ['CITING', 'CITED']
    cite_selected = ['CITED']
    myPyQueryLeft = PyQuery()
    myPyQueryLeft.InputAndSelect(
        'C:\\Users\\Administrator\\Desktop\\ITFiles\\cite75_99_7000000.txt',
        '\t', cite_columns, cite_selected)
    myPyQueryLeft.GroupBy(['CITED'])
    myPyQueryLeft.CountEach('CITED_COUNT')
    myPyQueryLeft.Top(['CITED_COUNT'], 100)

    # Right side: paper details; only the id, grant year and country are kept.
    paper_columns = [
        'PATENT', 'GYEAR', 'GDATE', 'APPYEAR', 'COUNTRY', 'POSTATE',
        'ASSIGNEE', 'ASSCODE', 'CLAIMS', 'NCLASS', 'CAT', 'SUBCAT', 'CMADE',
        'CRECEIVE', 'RATIOCIT', 'GENERAL', 'ORIGINAL', 'FWDAPLAG', 'BCKGTLAG',
        'SELFCTUB', 'SELFCTLB', 'SECDUPBD', 'SECDLWBD',
    ]
    paper_selected = ['PATENT', 'GYEAR', 'COUNTRY']
    myPyQueryRight = PyQuery()
    myPyQueryRight.InputAndSelect(
        'C:\\Users\\Administrator\\Desktop\\ITFiles\\apat63_99.txt',
        ',', paper_columns, paper_selected)

    # Attach year and country to every top-cited paper, then write the result.
    myPyQueryLeft.LeftJoin(['CITED'], myPyQueryRight, ['PATENT'])
    myPyQueryLeft.OutputAsFile('C:\\Users\\Administrator\\Desktop\\cite75_99_top')
    # NOTE(review): the sibling example scripts call Clean() after
    # OutputAsFile(); this one never did -- confirm whether it is needed here.

    # Single-argument print(...) parses identically on Python 2 and Python 3.
    print('total ' + str(time.time() - start) + ' seconds')
csdn十大常用邮箱
# -*- coding: utf-8 -*-
"""Report the 100 e-mail hosts most used by CSDN users."""
import time

# Input: csdnpwd -- every line has the form "username#password#email", e.g.:
#   zdg # 123 # zdg@csdn.net
#   LaoZheng # 670207 # chengming_zheng@163.com
#   fstao # 123 # fstao@tom.com
#   ......


def getMailHost(oneEmailAddr):
    """Return the host part of an e-mail address, stripped and lower-cased.

    The host is everything after the last '@'.  Surrounding whitespace is
    removed because the fields in the input file are padded with spaces
    around the '#' separators.

    If the value contains no '@', ``rfind`` returns -1 and the slice starts
    at 0, so the whole value (stripped and lower-cased) is returned.
    """
    host_start = oneEmailAddr.rfind('@') + 1
    return oneEmailAddr[host_start:].strip().lower()


if __name__ == "__main__":
    # Imported here rather than at module level so that getMailHost can be
    # imported and reused without the project-local PyQuery module.
    from PyQuery import PyQuery

    start = time.time()

    # Top 100 e-mail hosts used by CSDN's users.
    column_names = ['username', 'password', 'email']
    selected_columns = ['email']
    myPyQuery = PyQuery()
    myPyQuery.InputAndSelect(
        'C:\\Users\\Administrator\\Desktop\\ITFiles\\csdnpwd',
        '#', column_names, selected_columns)
    # Reduce every address to its host before grouping and counting.
    myPyQuery.Translate(['email'], getMailHost)
    myPyQuery.GroupBy(['email'])
    myPyQuery.CountEach('email_count')
    myPyQuery.Top(['email_count'], 100)
    myPyQuery.OutputAsFile('C:\\Users\\Administrator\\Desktop\\topemailhost')
    myPyQuery.Clean()

    # Single-argument print(...) parses identically on Python 2 and Python 3.
    print('total ' + str(time.time() - start) + ' seconds')
每个旅行团消费最多的三个城市
# -*- coding: utf-8 -*-
"""Report, for every travel agency, the three towns where it spent the most."""
from PyQuery import PyQuery
import time

# Input: travelagent -- 5 million records; every line is one spending record
# of a travel agency in some town: "<travel agency>\t<town>\t<amount>".
# Sample:
#   204 21608 0.43
#   225 151 1.06
#   225 151 2.62
#   225 221049 2.53

if __name__ == "__main__":
    start = time.time()

    # Top 3 towns of each travel agency, ranked by the total amount spent.
    column_names = ['travelagency', 'town', 'price_d']
    selected_columns = ['travelagency', 'town', 'price_d']
    myPyQuery = PyQuery()
    myPyQuery.InputAndSelect(
        'C:\\Users\\Administrator\\Desktop\\ITFiles\\2011_travelagent_5000000',
        '\t', column_names, selected_columns)
    # First pass: total spend per (agency, town) pair.
    myPyQuery.GroupBy(['travelagency', 'town'])
    myPyQuery.SumEach(['price_d'], 'price_sum')
    # Second pass: within each agency keep the 3 largest totals.
    myPyQuery.GroupBy(['travelagency'])
    myPyQuery.TopEach(['price_sum'], 3)
    myPyQuery.OutputAsFile('C:\\Users\\Administrator\\Desktop\\travelagent')
    myPyQuery.Clean()

    # Single-argument print(...) parses identically on Python 2 and Python 3.
    print('total ' + str(time.time() - start) + ' seconds')
评论关闭