过滤html
value = re.sub('<[^>]+>','', value)
import re

# Sample markup: a single <div> wrapped in leading/trailing newlines.
value = '''
<div> sdb</div>
'''
# Remove every "<...>" tag; only the text between the tags survives.
value = re.compile('<[^>]+>').sub('', value)
value  # notebook echo of the stripped text
# -> '\n sdb\n'
value.replace('\n', ' ')  # echo only: the result is not stored back into `value`
过滤乱码re.sub()
函数说明:sub(pattern, repl, string) 把字符串 string 中所有匹配表达式 pattern 的地方替换成 repl。[^**] 表示不匹配此字符集中的任何一个字符;\u4e00-\u9fa5 是汉字的 unicode 范围,\u0030-\u0039 是数字的 unicode 范围,\u0041-\u005a 是大写字母的 unicode 范围,\u0061-\u007a 是小写字母的 unicode 范围。
import re

string = "北京大学beijing985大学@#¥……&{}*@$%)..+_)("

# Keep Chinese characters (\u4e00-\u9fa5), digits (\u0030-\u0039) and ASCII
# letters (\u0041-\u005a, \u0061-\u007a); every other character is deleted.
# Results are bound to descriptive names instead of `str`, so the builtin
# `str` type is not shadowed for the rest of the script.
cleaned = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", string)
print(cleaned)
# Output: 北京大学beijing985大学

# Extract Chinese characters only.
chinese_only = re.sub(u"([^\u4e00-\u9fa5])", "", string)
print(chinese_only)
# Output: 北京大学大学

# Extract Chinese characters and digits.
chinese_and_digits = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039])", "", string)
print(chinese_and_digits)
# Output: 北京大学985大学
获取<tr></tr>标签之间内容
res_tr = r'<tr>(.*?)</tr>'
m_tr = re.findall(res_tr,language,re.S|re.M)
import re

language = '''<tr><th>性別:</th><td>男</td></tr><tr>'''

# The three patterns do not change between rows, so they are defined once
# instead of being rebuilt inside the loop.
res_tr = r'<tr>(.*?)</tr>'
res_th = r'<th>(.*?)</th>'
res_td = r'<td>(.*?)</td>'

# re.S lets "." match newlines, so a row spread over several lines still matches.
m_tr = re.findall(res_tr, language, re.S | re.M)
for line in m_tr:
    print(line)
    # First column: text of every <th> header cell in this row.
    m_th = re.findall(res_th, line, re.S | re.M)
    for mm in m_th:
        print("<th>", mm)
    # Second column: text of every <td> value cell in this row.
    m_td = re.findall(res_td, line, re.S | re.M)
    for nn in m_td:
        print("<td>", nn)
<th>性別:</th><td>男</td>
<th> 性別:
<td> 男
获取超链接<a href=..></a>之间内容
res = r'<a .*?>(.*?)</a>'
mm = re.findall(res, content, re.S|re.M)
urls=re.findall(r"<a.*?href=.*?<\/a>", content, re.I|re.S|re.M)
# coding=utf-8
import re

# Sample markup. The literal is opened with exactly three quotes; the former
# five-quote opener silently put two stray quote characters into the text.
content = '''
<td>
<a href="www.kklike.com" title="ab">abc</a>
<a href="www.kklike.com" title="cd">cde</a>
</td>
'''

# Text between <a ...> and </a>.
print(u'获取链接文本内容:')
res = r'<a .*?>(.*?)</a>'
mm = re.findall(res, content, re.S | re.M)
for value in mm:
    print(value)

# Complete <a ... href=...>...</a> elements, tags included.
print(u'\n获取完整链接内容:')
urls = re.findall(r"<a.*?href=.*?<\/a>", content, re.I | re.S | re.M)
for i in urls:
    print(i)

# Only the URL: look-behind/look-ahead keep the quotes and "href=" out of
# the match; both double- and single-quoted attribute values are accepted.
print(u'\n获取链接中URL:')
res_url = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
link = re.findall(res_url, content, re.I | re.S | re.M)
for url in link:
    print(url)
获取链接文本内容:
abc
cde
获取完整链接内容:
<a href="www.kklike.com" title="ab">abc</a>
<a href="www.kklike.com" title="cd">cde</a>
获取链接中URL:
www.kklike.com
www.kklike.com
获取URL最后一个参数命名图片或传递参数
# Use the last path segment of an image URL as its file name.
urls = "http://www.kklike.com/BbsImg141568417848931_640*640.jpg"
# A single split from the right yields the same final segment as a full split.
values = urls.rsplit('/', 1)[-1]
values  # notebook echo of the file name
'BbsImg141568417848931_640*640.jpg'
url = 'http://www.kklike.com/test.py?a=hello&b=world'
# Everything after the "?" is the query string.
values = url.split('?')[-1]
print(values)
# Split each "key=value" pair on the first "=" only, so a value that itself
# contains "=" stays in one piece.
pairs = [key_value.split('=', 1) for key_value in values.split('&')]
for pair in pairs:
    print(pair)
a=hello&b=world
['a', 'hello']
['b', 'world']
爬取网页中所有URL链接
# coding=utf-8
import re
import urllib.request

url = "http://www.kklike.com/"
# `import urllib` alone does not load the `request` submodule, so it is
# imported explicitly. The context manager closes the HTTP response.
with urllib.request.urlopen(url) as response:
    content = response.read().decode("utf8")
# print(content)
# Complete <a ... href=...>...</a> elements, matched case-insensitively.
# Without re.S, "." stops at newlines, so only single-line anchors match.
urls = re.findall(r"<a.*?href=.*?<\/a>", content, re.I)
print(urls[:3])
# Value of every href="..." / href='...' attribute; the pattern is not tied
# to <a> tags, so <link href=...> entries are returned as well.
link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", content)
print(link_list[:3])
['<a href="http://www.kklike.com">kklike</a>', '<a href="/" title="首页"><img src="/images/finwod.png" alt="首页"/></a>', '<a href="/image.do" title="图片上传"><img src="/images/upload.png" alt="图片上传"></img></a>']
['/images/favicon.ico', '/styles/kklike.css', "/styles/kklike.css?v=' + now + '"]
爬取网页标题title两种方法
# coding=utf-8
import re
import urllib.request

url = "http://www.kklike.com/"
# `urllib.request` must be imported explicitly; `import urllib` does not
# load it. The context manager closes the HTTP response.
with urllib.request.urlopen(url) as response:
    content = response.read().decode("utf8")

# Method 1: look-behind/look-ahead, so the match is the title text itself.
title_pat = r'(?<=<title>).*?(?=</title>)'
title_ex = re.compile(title_pat, re.M | re.S)
title_obj = title_ex.search(content)
# search() returns None when the page has no <title>; do not call .group() on it.
title = title_obj.group() if title_obj else ''
print(title)

# Method 2: a capture group with findall(). re.S is passed so that, like
# method 1, a title spanning several lines is still found.
title = re.findall(r'<title>(.*?)</title>', content, re.S)
print(title[0] if title else '')
www.kklike.com--流着看看(图文爱好者)
www.kklike.com--流着看看(图文爱好者)
定位table位置并爬取属性-属性值
# coding=utf-8
import re

content = '''sdf<table class="infobox vevent"><tr><td></td></tr></table>sdf '''
# Offset of the infobox table's opening tag; -1 when it is absent.
start = content.find(r'<table class="infobox vevent"')
# Search for the closing tag only after the opening one, so a </table> that
# appears earlier in the text cannot be picked up by mistake.
end = content.find(r'</table>', start) if start != -1 else -1
# The slice stops before "</table>", so the closing tag itself is excluded.
# An empty string is produced when either tag is missing.
infobox = content[start:end] if end != -1 else ''
print(infobox)
<table class="infobox vevent"><tr><td></td></tr>
s = '''<table>
<tr>
<td>序列号</td><td>DEIN3-39CD3-2093J3</td>
<td>日期</td><td>2013年1月22日</td>
<td>售价</td><td>392.70 元</td>
<td>说明</td><td>仅限5用户使用</td>
</tr>
</table>
'''
# Two adjacent <td> cells form one (attribute, value) pair; with two capture
# groups findall() returns a list of 2-tuples.
res = r'<td>(.*?)</td><td>(.*?)</td>'
m = re.findall(res, s, re.S | re.M)
for line in m:
    print(line[0], line[1])
# Alternative noted by the author: pandas.read_html() can parse HTML tables.
序列号 DEIN3-39CD3-2093J3
日期 2013年1月22日
售价 392.70 元
说明 仅限5用户使用
过滤<span></span>等标签
在获取值过程中,通常会存在<span>、<br>、<a href>等标签,下面举个例子过滤。
<td><span class="nickname">(字) 翔宇</span></td>过滤标签
# Fragment: `nn` is presumably the text of one <td> cell from the table loop
# shown in the full example -- it is not defined in this snippet.
if "span" in nn:  # the cell text is wrapped in a <span> tag
    # Capture the text between <span ...> and </span>.
    res_value = r'<span .*?>(.*?)</span>'
    m_value = re.findall(res_value, nn, re.S | re.M)
    for value in m_value:
        print(value)
# coding=utf-8
import re

language = '''
<table class="infobox bordered vcard" style="width: 21em; font-size: 89%; text-align: left;" cellpadding="3">
<caption style="text-align: center; font-size: larger;" class="fn"><b>kklike</b></caption>
<tr><th>site:</th><td>kklike.com</td></tr>
<tr><th>title:</th><td> 图文爱好者</td></tr>
</tr>
</table>
'''

# Patterns do not change between rows, so they are defined once up front.
res_tr = r'<tr>(.*?)</tr>'
res_th = r'<th>(.*?)</th>'
res_td = r'<td>(.*?)</td>'  # r'<td .*?>(.*?)</td>' would be needed for cells that carry attributes

# Every <tr>...</tr> row of the table.
m_tr = re.findall(res_tr, language, re.S | re.M)
for line in m_tr:
    # First column: the attribute name held in <th>.
    m_th = re.findall(res_th, line, re.S | re.M)
    for mm in m_th:
        if "href" in mm:  # the header text is wrapped in a hyperlink
            restr = r'<a href=.*?>(.*?)</a>'
            h = re.findall(restr, mm, re.S | re.M)
            # "href" can occur without the pattern matching (e.g. another
            # attribute precedes it); fall back to the raw cell text.
            print(h[0] if h else mm)
        else:
            # end=' ' keeps the attribute name and its value on one line.
            print(mm, end=' ')
    # Second column: the attribute value held in <td>.
    m_td = re.findall(res_td, line, re.S | re.M)
    for nn in m_td:
        if "href" in nn:  # value wrapped in <a href=.. / rel=..>...</a>
            res_value = r'<a .*?>(.*?)</a>'
            m_value = re.findall(res_value, nn, re.S | re.M)
            for value in m_value:
                print(value)
        elif "span" in nn:  # value wrapped in <span ...>, e.g. <td><span class="nickname">...</span></td>
            res_value = r'<span .*?>(.*?)</span>'
            m_value = re.findall(res_value, nn, re.S | re.M)
            for value in m_value:
                print(value)
        else:
            print(nn)
site: kklike.com
title: 图文爱好者
获取<script></script>等标签内容
# coding=utf-8
import os
import re
import urllib.request

from IPython.display import display, Image,Video,HTML

# Sample page fragment. The literal is opened with exactly three quotes; the
# former five-quote opener put two stray quote characters into the text.
content = '''
<script>var images = [
{ "original":"http://shop.kklike.com/_image/upload/2014/107b4495-4860-4b76-b807-d7f81e27f4a8.jpg",
"title":"","descript":"","id":75109},
{ "original":"http://shop.kklike.com/_image/upload/2014/107b4495-4860-4b76-b807-d7f81e27f4a8.jpg",
"title":"","descript":"","id":75110},
</script>
'''
html_script = r'<script>(.*?)</script>'
res_original = r'"original":"(.*?)"'  # URL of the full-size image
m_script = re.findall(html_script, content, re.S | re.M)

# urlretrieve() does not create missing directories, so make sure the
# download target exists before the first file is written.
os.makedirs("image", exist_ok=True)

for script in m_script:
    m_original = re.findall(res_original, script)
    for pic_url in m_original:
        # Was a bare `print` followed by `pic_url` on its own line, which
        # printed nothing under Python 3.
        print(pic_url)
        filename = os.path.basename(pic_url)  # drop the directory part, keep the file name
        local_path = "image/" + filename
        # `urllib.request` has to be imported explicitly; `import urllib`
        # alone does not load the submodule.
        urllib.request.urlretrieve(pic_url, local_path)  # download the image
        display(Image(local_path, width=200))
通过replace过滤<br />标签
# Fragment: `value` is the extracted cell text; it is defined by the
# surrounding extraction code, not in this snippet.
if '<br />' in value:
    value = value.replace('<br />', '')  # drop the <br /> tag itself
    value = value.replace('\n', ' ')  # newlines become spaces so the text stays on one line
获取<img ../>中超链接及过滤<img>标签
re.findall('src="(.*?)"', test)
import re

# Sample <img> tag whose src attribute is to be pulled out.
test = '''<img alt="图文爱好者" src="../images/kklike.png" width="19" height="19" border="0" />'''
# The capture group grabs whatever sits between the quotes after src=.
src_pattern = re.compile('src="(.*?)"')
print(src_pattern.findall(test))
['../images/kklike.png']