过滤html

# Key expression: delete every "<...>" tag from `value`, keeping only the text between tags.
value = re.sub('<[^>]+>','', value)

import re

# Sample markup: a <div> wrapped in newlines.
value = '''
<div> sdb</div>
'''

# Strip every "<...>" tag, leaving only the text that sat between the tags.
tag_re = re.compile('<[^>]+>')
value = tag_re.sub('', value)
value  # bare expression: echoes the result when run as a notebook cell

'\n sdb\n'

# Returns a copy with each newline swapped for a space; `value` itself is not reassigned here.
value.replace('\n',' ')

过滤乱码re.sub()

函数说明：sub(pattern, repl, string) 把字符串 string 中所有匹配表达式 pattern 的地方替换成 repl。[^...] 表示不匹配此字符集中的任何一个字符。常用的 Unicode 范围：\u4e00-\u9fa5 为汉字，\u0030-\u0039 为数字，\u0041-\u005a 为大写字母，\u0061-\u007a 为小写字母。

import re

string = "北京大学beijing985大学@#￥……&{}*@$%）..+_)("
# Keep only CJK ideographs (\u4e00-\u9fa5), digits and ASCII letters; delete
# every other character.
# The result is bound to `cleaned` rather than `str` so the built-in str type
# is not shadowed for the rest of the session.
cleaned = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", string)
print(cleaned)

北京大学beijing985大学

# Extract Chinese characters only: delete everything outside \u4e00-\u9fa5.
# Bound to `chinese_only` instead of `str` so the built-in str type stays usable.
chinese_only = re.sub(u"([^\u4e00-\u9fa5])", "", string)
print(chinese_only)

北京大学大学

# Extract Chinese characters and digits: delete everything outside
# \u4e00-\u9fa5 and \u0030-\u0039.
# Bound to `chinese_and_digits` instead of `str` so the built-in str type stays usable.
chinese_and_digits = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039])", "", string)
print(chinese_and_digits)

北京大学985大学

获取<tr></tr>标签之间内容

# Key expression: non-greedy capture of everything between each <tr> and its closing </tr>.
# re.S lets "." match newlines as well, so a captured row may span several lines.
res_tr = r'<tr>(.*?)</tr>'
m_tr =  re.findall(res_tr,language,re.S|re.M)

import re

language = '''<tr><th>性別：</th><td>男</td></tr><tr>'''

# Non-greedy patterns for a table row and for its header / data cells.
res_tr = r'<tr>(.*?)</tr>'
res_th = r'<th>(.*?)</th>'
res_td = r'<td>(.*?)</td>'
flags = re.S | re.M

m_tr = re.findall(res_tr, language, flags)
for line in m_tr:
    print(line)
    # First column: text of each <th> header cell in this row.
    m_th = re.findall(res_th, line, flags)
    for mm in m_th:
        print("<th>", mm)
    # Second column: text of each <td> data cell in this row.
    m_td = re.findall(res_td, line, flags)
    for nn in m_td:
        print("<td>", nn)

<th>性別：</th><td>男</td>
<th> 性別：
<td> 男

获取超链接<a href=..></a>之间内容

# Key expressions: `res` captures the text between each <a ...> and </a>;
# the second pattern matches each whole <a ... href=...>...</a> element, ignoring case (re.I).
res = r'<a .*?>(.*?)</a>'
mm =  re.findall(res, content, re.S|re.M)
urls=re.findall(r"<a.*?href=.*?<\/a>", content, re.I|re.S|re.M)

# coding=utf-8
import re

# Sample markup. The literal opens with a plain ''' delimiter; five quotes
# would silently put a stray '' at the very start of the text.
content = '''
<td>
<a href="www.kklike.com" title="ab">abc</a>
<a href="www.kklike.com" title="cd">cde</a>
</td>
'''

# Text between each <a ...> and </a>
print(u'获取链接文本内容:')
res = r'<a .*?>(.*?)</a>'
mm = re.findall(res, content, re.S | re.M)
for value in mm:
    print(value)

# Each complete <a ... href=...>...</a> element
print(u'\n获取完整链接内容:')
urls = re.findall(r"<a.*?href=.*?<\/a>", content, re.I | re.S | re.M)
for i in urls:
    print(i)

# The URL inside each href="..." or href='...' attribute; the lookbehind and
# lookahead keep the quotes themselves out of the match.
print(u'\n获取链接中URL:')
res_url = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
link = re.findall(res_url, content, re.I | re.S | re.M)
for url in link:
    print(url)

获取链接文本内容:
abc
cde

获取完整链接内容:
<a href="www.kklike.com" title="ab">abc</a>
<a href="www.kklike.com" title="cd">cde</a>

获取链接中URL:
www.kklike.com
www.kklike.com

获取URL的最后一段（用于命名图片）或解析URL中传递的参数

urls = "http://www.kklike.com/BbsImg141568417848931_640*640.jpg"
# Everything after the last "/" is the file name portion of the URL.
_, _, values = urls.rpartition('/')
values  # bare expression: echoes the result when run as a notebook cell

'BbsImg141568417848931_640*640.jpg'

url = 'http://www.kklike.com/test.py?a=hello&b=world'
# The query string is whatever follows the last "?".
values = url.rpartition('?')[2]
print(values)
# Each "&"-separated item is printed as the list produced by splitting it on "=".
for key_value in values.split('&'):
    parts = key_value.split('=')
    print(parts)

a=hello&b=world
['a', 'hello']
['b', 'world']

爬取网页中所有URL链接

# coding=utf-8
import re
# `import urllib` alone does not load the `request` submodule, so the
# submodule is imported explicitly (this still binds the name `urllib`).
import urllib.request

url = "http://www.kklike.com/"
# Read the page inside a `with` block so the HTTP response is closed promptly.
with urllib.request.urlopen(url) as response:
    content = response.read().decode("utf8")

# Each <a ... href=...>...</a> element that fits on one line (no re.S here),
# matched case-insensitively.
urls = re.findall(r"<a.*?href=.*?<\/a>", content, re.I)
print(urls[:3])

# The value of every href="..." / href='...' attribute in the page, whatever
# tag it belongs to.
link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", content)
print(link_list[:3])

['<a href="http://www.kklike.com">kklike</a>', '<a href="/" title="首页"><img src="/images/finwod.png" alt="首页"/></a>', '<a href="/image.do" title="图片上传"><img src="/images/upload.png" alt="图片上传"></img></a>']
['/images/favicon.ico', '/styles/kklike.css', "/styles/kklike.css?v=' + now + '"]

爬取网页标题title两种方法

# coding=utf-8
import re
# `import urllib` alone does not load the `request` submodule, so the
# submodule is imported explicitly (this still binds the name `urllib`).
import urllib.request

url = "http://www.kklike.com/"
# Read the page inside a `with` block so the HTTP response is closed promptly.
with urllib.request.urlopen(url) as response:
    content = response.read().decode("utf8")

# Method 1: lookbehind/lookahead so that only the text inside <title> is matched.
title_pat = r'(?<=<title>).*?(?=</title>)'
title_ex = re.compile(title_pat, re.M | re.S)
title_obj = title_ex.search(content)
# search() returns None when the page has no <title>; print an empty string
# in that case instead of raising AttributeError.
title = title_obj.group() if title_obj else ''
print(title)

# Method 2: capture group with findall; guard against an empty result list.
title = re.findall(r'<title>(.*?)</title>', content)
print(title[0] if title else '')

www.kklike.com--流着看看（图文爱好者）
www.kklike.com--流着看看（图文爱好者）

定位table位置并爬取属性-属性值

# coding=utf-8
import re

content = '''sdf<table class="infobox vevent"><tr><td></td></tr></table>sdf '''

# Locate the opening tag of the infobox table, then look for the closing tag
# only after that position so an earlier </table> cannot be picked up.
start = content.find('<table class="infobox vevent"')
end = content.find('</table>', start) if start != -1 else -1
# The slice stops just before </table>; it is empty when either marker is missing.
infobox = content[start:end] if end != -1 else ''
print(infobox)

<table class="infobox vevent"><tr><td></td></tr>

s = '''<table>   
<tr>   
<td>序列号</td><td>DEIN3-39CD3-2093J3</td>   
<td>日期</td><td>2013年1月22日</td>   
<td>售价</td><td>392.70 元</td>   
<td>说明</td><td>仅限5用户使用</td>   
</tr>   
</table> 
'''
# Two adjacent <td> cells form one (attribute, value) pair.
res = r'<td>(.*?)</td><td>(.*?)</td>'
pair_re = re.compile(res, re.S | re.M)
m = pair_re.findall(s)
for line in m:
    attribute, attribute_value = line
    print(attribute, attribute_value)
    
# pandas.read_html()

序列号 DEIN3-39CD3-2093J3
日期 2013年1月22日
售价 392.70 元
说明 仅限5用户使用

过滤<span></span>等标签

在获取值过程中，通常会存在<span>、<br>、<a href>等标签，下面举个例子过滤。

例如，过滤 <td><span class="nickname">(字) 翔宇</span></td> 中的标签：

# Unwrap <span ...>...</span>: print the text inside each span found in the cell `nn`.
if "span" in nn:
    res_value = r'<span .*?>(.*?)</span>'
    span_re = re.compile(res_value, re.S | re.M)
    m_value = span_re.findall(nn)
    for value in m_value:
        print(value)

# coding=utf-8
import re

language = '''
<table class="infobox bordered vcard" style="width: 21em; font-size: 89%; text-align: left;" cellpadding="3">
<caption style="text-align: center; font-size: larger;" class="fn"><b>kklike</b></caption>
<tr><th>site：</th><td>kklike.com</td></tr>
<tr><th>title：</th><td> 图文爱好者</td></tr>
</tr>
</table>
'''

# All patterns are compiled once, up front, with the same flags.
FLAGS = re.S | re.M
row_re = re.compile(r'<tr>(.*?)</tr>', FLAGS)
th_re = re.compile(r'<th>(.*?)</th>', FLAGS)
td_re = re.compile(r'<td>(.*?)</td>', FLAGS)  # cells with attributes would need r'<td .*?>(.*?)</td>'
th_link_re = re.compile(r'<a href=.*?>(.*?)</a>', FLAGS)
link_re = re.compile(r'<a .*?>(.*?)</a>', FLAGS)
span_re = re.compile(r'<span .*?>(.*?)</span>', FLAGS)  # e.g. <td><span class="nickname">...</span></td>

# Rows of the table.
m_tr = row_re.findall(language)
for line in m_tr:
    # First column: the attribute name held in <th>.
    for mm in th_re.findall(line):
        if "href" not in mm:
            # Plain header: stay on the same line so the value is printed next to it.
            print(mm, end=' ')
            continue
        # Header wrapped in a hyperlink: print the text of the first link.
        print(th_link_re.findall(mm)[0])

    # Second column: the attribute value held in <td>.
    for nn in td_re.findall(line):
        if "href" in nn:
            inner_re = link_re  # unwrap <a ...>...</a>
        elif "span" in nn:
            inner_re = span_re  # unwrap <span ...>...</span>
        else:
            print(nn)
            continue
        for value in inner_re.findall(nn):
            print(value)

site： kklike.com
title：  图文爱好者

获取<script></script>等标签内容

# coding=utf-8
import re
import os
# `import urllib` alone does not load the `request` submodule, so the
# submodule is imported explicitly (this still binds the name `urllib`).
import urllib.request

# Sample markup. The literal opens with a plain ''' delimiter; five quotes
# would silently put a stray '' at the very start of the text.
content = '''
<script>var images = [
{ "original":"http://shop.kklike.com/_image/upload/2014/107b4495-4860-4b76-b807-d7f81e27f4a8.jpg",
  "title":"","descript":"","id":75109},
{ "original":"http://shop.kklike.com/_image/upload/2014/107b4495-4860-4b76-b807-d7f81e27f4a8.jpg",
  "title":"","descript":"","id":75110},
</script>
'''

html_script = r'<script>(.*?)</script>'
res_original = r'"original":"(.*?)"'  # URL of the original (full-size) image

# urlretrieve does not create the target directory, so make sure it exists.
os.makedirs("image", exist_ok=True)

m_script = re.findall(html_script, content, re.S | re.M)
for script in m_script:
    m_original = re.findall(res_original, script)
    for pic_url in m_original:
        print(pic_url)
        filename = os.path.basename(pic_url)  # drop the directory part, keep the file name
        urllib.request.urlretrieve(pic_url, "image/" + filename)  # download the image

from IPython.display import HTML, Image, Video, display

# Render the file at 'image/<filename>' inline in the notebook with width=200;
# `filename` is whatever the download loop assigned last.
thumbnail = Image('image/' + filename, width=200)
display(thumbnail)

通过replace过滤<br />标签

# Only values that actually contain a <br /> tag are rewritten.
if value.count('<br />') > 0:
    # Remove the tag, then replace newlines with spaces so the text stays on one line.
    value = value.replace('<br />', '').replace('\n', ' ')

获取<img ../>中超链接及过滤<img>标签

# Key expression: capture the value of every src="..." attribute in the string `test`.
re.findall('src="(.*?)"', test)

import re

test = '''<img alt="图文爱好者" src="../images/kklike.png" width="19" height="19" border="0" />'''
# Non-greedy capture, so each match stops at the attribute's closing quote.
src_re = re.compile('src="(.*?)"')
print(src_re.findall(test))

['../images/kklike.png']

前军教程网

中小站长与DIV+CSS网页布局开发技术人员的首选CSS学习平台

python re 正则处理html（python 正则 \w）