PyQuery:文本统计的通用类,实现类SQL接口,pyquerysql,被引用次数最多的10篇论文


被引用次数最多的10篇论文

# -*- coding: utf-8 -*-
# Example: find the most-cited papers and left-join their year and country.
#
# Input "cite75_99" (the file name suggests a 7,000,000-record subset): each
# line is one citation record -- a paper, then a paper it cites:
#   3858241,956203
#   3858241,1324234
#   3858241,3634889
#   3858242,1515701
#   3858242,3319261
#   ......
# NOTE(review): the original note describes the two fields as tab-separated
# and the code passes '\t', but the sample rows above are comma-separated.
# Confirm the delimiter against the real input file.
#
# Input "apat63_99.txt" (about 3 million records; the original note says
# "300百万", presumably a typo for 300万): each line holds the details of one
# paper. It is left-joined to look up each paper's year and country:
#   3070801,1963,1096,,"BE","",,1,,269,6,69,,1,,0,,,,,,,
#   3070802,1963,1096,,"US","TX",,1,,2,6,63,,0,,,,,,,,,
#   3070803,1963,1096,,"US","IL",,1,,2,6,63,,9,,0.3704,,,,,,,
#   3070804,1963,1096,,"US","OH",,1,,2,6,63,,3,,0.6667,,,,,,,
#   ......
from PyQuery import PyQuery
import time


def main():
    """Write the top 100 most-cited papers, with year and country, to a file."""
    start = time.time()

    # Left side: keep only the CITED column, count rows per cited paper and
    # keep the 100 largest counts.
    citation_columns = ['CITING', 'CITED']
    citation_selected = ['CITED']
    citations = PyQuery()
    citations.InputAndSelect(
        'C:\\Users\\Administrator\\Desktop\\ITFiles\\cite75_99_7000000.txt',
        '\t',
        citation_columns,
        citation_selected,
    )
    citations.GroupBy(['CITED'])
    citations.CountEach('CITED_COUNT')
    citations.Top(['CITED_COUNT'], 100)

    # Right side: paper details; only PATENT, GYEAR and COUNTRY are selected.
    paper_columns = [
        'PATENT', 'GYEAR', 'GDATE', 'APPYEAR', 'COUNTRY', 'POSTATE',
        'ASSIGNEE', 'ASSCODE', 'CLAIMS', 'NCLASS', 'CAT', 'SUBCAT', 'CMADE',
        'CRECEIVE', 'RATIOCIT', 'GENERAL', 'ORIGINAL', 'FWDAPLAG', 'BCKGTLAG',
        'SELFCTUB', 'SELFCTLB', 'SECDUPBD', 'SECDLWBD',
    ]
    paper_selected = ['PATENT', 'GYEAR', 'COUNTRY']
    papers = PyQuery()
    papers.InputAndSelect(
        'C:\\Users\\Administrator\\Desktop\\ITFiles\\apat63_99.txt',
        ',',
        paper_columns,
        paper_selected,
    )

    # Join the counted citations (key CITED) with the paper details
    # (key PATENT), then write the result out.
    citations.LeftJoin(['CITED'], papers, ['PATENT'])
    citations.OutputAsFile('C:\\Users\\Administrator\\Desktop\\cite75_99_top')
    # NOTE(review): the other example scripts call Clean() after
    # OutputAsFile(); this one never did. Confirm whether it is needed here.

    # Parenthesised single-argument print works on both Python 2 and 3.
    print('total ' + str(time.time() - start) + ' seconds')


if __name__ == "__main__":
    main()

csdn十大常用邮箱

# -*- coding: utf-8 -*-
# Example: find the e-mail hosts most used by CSDN users.
#
# Each line of the input file "csdnpwd" has the form
# username#password#email, for example:
#   zdg # 123 # zdg@csdn.net
#   LaoZheng # 670207 # chengming_zheng@163.com
#   fstao # 123 # fstao@tom.com
#   ......
from PyQuery import PyQuery
import time


def getMailHost(oneEmailAddr):
    """Return the host part of an e-mail address, stripped and lower-cased.

    The host is everything after the last '@'. When the address contains no
    '@', the whole (stripped, lower-cased) string is returned.
    """
    # rpartition yields the text after the last '@', or the entire string
    # when no '@' is present -- the same result as slicing from rfind('@')+1.
    return oneEmailAddr.rpartition('@')[2].strip().lower()


def main():
    """Write the 100 most frequent e-mail hosts in csdnpwd to a file."""
    start = time.time()

    # Top 100 e-mail hosts used by CSDN's users.
    all_columns = ['username', 'password', 'email']
    selected_columns = ['email']
    query = PyQuery()
    query.InputAndSelect(
        'C:\\Users\\Administrator\\Desktop\\ITFiles\\csdnpwd',
        '#',
        all_columns,
        selected_columns,
    )
    # Reduce every address to its host before grouping; the sample rows have
    # spaces around the '#' separator, which getMailHost strips.
    query.Translate(['email'], getMailHost)
    query.GroupBy(['email'])
    query.CountEach('email_count')
    query.Top(['email_count'], 100)
    query.OutputAsFile('C:\\Users\\Administrator\\Desktop\\topemailhost')
    query.Clean()

    # Parenthesised single-argument print works on both Python 2 and 3.
    print('total ' + str(time.time() - start) + ' seconds')


if __name__ == "__main__":
    main()

每个旅行团消费最多的三个城市

# -*- coding: utf-8 -*-
# Example: for every travel agency, find the three towns where it spent most.
#
# Input "travelagent" (5 million records): each line is one purchase made by
# a travel agency in some town -- agency \t town \t amount:
#   204     21608   0.43
#   225     151     1.06
#   225     151     2.62
#   225     221049  2.53
from PyQuery import PyQuery
import time


def main():
    """Write each travel agency's top 3 towns by total spending to a file."""
    start = time.time()

    # Top 3 towns of each travel agency, ranked by how much it spent there.
    all_columns = ['travelagency', 'town', 'price_d']
    selected_columns = ['travelagency', 'town', 'price_d']
    query = PyQuery()
    query.InputAndSelect(
        'C:\\Users\\Administrator\\Desktop\\ITFiles\\2011_travelagent_5000000',
        '\t',
        all_columns,
        selected_columns,
    )
    # First pass: total the spending per (agency, town) pair.
    query.GroupBy(['travelagency', 'town'])
    query.SumEach(['price_d'], 'price_sum')
    # Second pass: regroup by agency and keep its 3 largest totals.
    query.GroupBy(['travelagency'])
    query.TopEach(['price_sum'], 3)
    query.OutputAsFile('C:\\Users\\Administrator\\Desktop\\travelagent')
    query.Clean()

    # Parenthesised single-argument print works on both Python 2 and 3.
    print('total ' + str(time.time() - start) + ' seconds')


if __name__ == "__main__":
    main()

评论关闭