python正则表达式学习代码,python正则表达式,Regexp.py


Regexp.py

"""Walk-through of Python's ``re`` module, printed to stdout.

Created on 2012-11-28

@author: jiangxiaoqiang

Covers searching, e-mail validation, ``findall`` with and without groups,
matching flags, lookahead/lookbehind assertions, greedy versus non-greedy
quantifiers, and ``re.sub`` with group back-references.
"""
# coding: utf-8
import re

# Rule printed after each demo section.
_SEPARATOR = "-------------------------------------"

# Sample text shared by the two findall demos.
_EMAIL_TEXT = 'purple alice@google.com, blah monkey bob@abc.com blah dishwasher'


def _demo_search():
    """Show that ``re.search`` finds a pattern anywhere inside a string."""
    print("re.search():")
    text = 'an example word:cat!!'
    # All of the pattern must be matched, but not all of the string.
    # The r prefix makes a raw string, so backslashes are not escaped
    # by Python before the pattern reaches the regex engine.
    match = re.search(r'word:\w\w\w', text)
    if match:
        print('Found: ' + match.group())
    else:
        print("Didn't found!")
    print(_SEPARATOR)


def _demo_email_validation():
    """Validate a single e-mail address with an anchored pattern."""
    print("邮箱验证:")
    # ^ and $ must not be omitted: they force the pattern to cover the
    # whole address instead of merely matching somewhere inside it.
    pattern = r'^[\w-]+(\.[\w-]+)*@([\w-]+\.)+[a-zA-Z]+$'
    email = 'feichexia@yahoo.com.cn'
    if re.search(pattern, email):
        print(email, 'is valid!')
    else:
        print(email, 'is not valid!')
    print(_SEPARATOR)


def _demo_findall():
    """List every substring that matches a group-free pattern."""
    print("re.findall():")
    # Without groups, findall returns a list of matched strings:
    # ['alice@google.com', 'bob@abc.com']
    emails = re.findall(r'[\w\.-]+@[\w\.-]+', _EMAIL_TEXT)
    print(emails)
    for email in emails:
        print(email)
    print(_SEPARATOR)


def _demo_findall_groups():
    """Show that capture groups make ``findall`` return tuples."""
    print('re.findall():元组')
    # Same text and almost the same pattern as the previous demo; the only
    # difference is the parentheses. Parentheses create groups, so findall
    # returns one tuple of group(1), group(2), ... per match:
    # [('alice', 'google.com'), ('bob', 'abc.com')]
    pairs = re.findall(r'([\w\.-]+)@([\w\.-]+)', _EMAIL_TEXT)
    print(pairs)
    for user, host in pairs:
        print('用户名:' + user)
        print('Host: ' + host)
    print(_SEPARATOR)


def _demo_flags():
    """Count matches when an option flag is passed to ``findall``."""
    # Available options include:
    #   re.IGNORECASE  ignore upper/lower case differences.
    #   re.MULTILINE   within a string made of many lines, allow ^ and $ to
    #                  match the start and end of each line. Normally ^/$
    #                  only match the start and end of the whole string.
    #   re.DOTALL      allow dot (.) to match newline; normally it matches
    #                  anything but newline, so .* does not go past the end
    #                  of a line by default. Note that \s (whitespace)
    #                  includes newlines, so \s* matches a run of whitespace
    #                  that may contain a newline.
    print("带额外匹配选项的正则匹配:")
    text = 'Foo None what bar\n    Not know universe\n    True nong'
    matches = re.findall(r'no', text, re.IGNORECASE)
    print("共匹配个数:" + str(len(matches)))
    print(_SEPARATOR)


def _demo_lookaround():
    """Print the matches of each lookahead / lookbehind variant."""
    print("高级匹配规则之正向预搜索与反向预搜索:")
    text = 'none know no'
    # Lookaround groups only assert; their content is never captured.
    cases = (
        # Positive lookahead: 'no' must be followed by 'w'.
        ("正向预搜索(或者正向预查),后面必须是:", r'no(?=w)'),
        # Negative lookahead: 'no' must not be followed by 'w'.
        ('正向预搜索(或者正向预查),后面必须不是:', r'no(?!w)'),
        # Positive lookbehind: 'no' must be preceded by 'k'.
        ('负向预搜索(或者负向预查),前面必须是:', r'(?<=k)no'),
        # Negative lookbehind: 'no' must not be preceded by 'k'.
        ('负向预搜索(或者负向预查),前面必须不是:', r'(?<!k)no'),
    )
    for heading, pattern in cases:
        print(heading)
        for found in re.findall(pattern, text):
            print(found)
        print(_SEPARATOR)


def _demo_greedy():
    """Extract HTML tags with a lazy quantifier and with a negated class."""
    print("高级匹配规则之贪婪匹配与非贪婪匹配:")
    # Suppose the text is '<b>foo</b> and <i>so on</i>' and every tag should
    # be matched. '(<.*>)' is surprising: .* is greedy, so it runs to the
    # last '>' and the single match is the entire string.
    # Appending ? (.*? or .+?) makes the quantifier non-greedy, so
    # '(<.*?>)' matches '<b>' first, then '</b>', and so on: after a '<'
    # the engine stops at the nearest following '>'.
    # The *? syntax originated in Perl; regex flavours that support Perl
    # syntax are called Perl-compatible.
    # The classic alternative is a negated character class: '(<[^>]*?>)'.
    html = '<b>foo</b> and <i>bar</i>'
    cases = (
        ("非贪婪匹配做法:", r'<.*?>'),
        ('中括号做法:', r'<[^>]*?>'),
    )
    for heading, pattern in cases:
        print(heading)
        for tag in re.findall(pattern, html):
            print(tag)
        print(_SEPARATOR)


def _demo_sub():
    """Swap two words using group back-references in ``re.sub``."""
    # re.sub(pattern, replacement, string) replaces every substring that
    # matches pattern. The replacement may contain \1, \2, ... which refer
    # to group(1), group(2), ...
    print("re.sub()用法:")
    print(re.sub(r'(\w+), (\w+)', r'\2, \1', "John, Smith"))  # Smith, John


def regexpTest():
    """Run every regular-expression demo in order.

    Takes no arguments and returns ``None``; all results are written to
    standard output.
    """
    _demo_search()
    _demo_email_validation()
    _demo_findall()
    _demo_findall_groups()
    _demo_flags()
    _demo_lookaround()
    _demo_greedy()
    _demo_sub()


def main():
    """Script entry point: run the regular-expression demos."""
    regexpTest()


if __name__ == '__main__':
    main()

评论关闭