与正则表达式、XPath一样,BeautifulSoup是一个用来从HTML或XML文件中提取数据的网页信息提取库。
pip install lxml
pip install bs4
from bs4 import BeautifulSoup

# Sample document used by every example in this section.
html_doc = """
<html><head><title>The Dormouse s story</title></head>
<body>
<p class="title"><b>The Dormouse s story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, "lxml")

# Get the <title> tag.
print(soup.title)

# Get the text inside the <title> tag.
print(soup.title.string)

# Print the whole parsed document.
print(soup)

# Pretty-print the whole parsed document.
print(soup.prettify())

# Print the text of every <b> and <title> tag (both hold "The Dormouse s story").
tags = soup.find_all(["b", "title"])
for tag in tags:
    print(tag.string)

# Print the id and href attribute of every <a> tag.
links = soup.find_all("a")
for link in links:
    print(link.get("id"))
    print(link.get("href"))

# Print every text string under <html>, with surrounding whitespace stripped.
texts = soup.html.stripped_strings
for text in texts:
    print(text)
from bs4 import BeautifulSoup

# Sample document used by every example in this section.
html_doc = """
<html><head><title>The Dormouse s story</title></head>
<body>
<p class="title"><b>The Dormouse s story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, "lxml")

# Get the <title> tag.
print(soup.title)

# Get the text inside the <title> tag.
print(soup.title.string)

# Print the whole parsed document.
print(soup)

# Pretty-print the whole parsed document.
print(soup.prettify())

# Print the text of every <b> and <title> tag (both hold "The Dormouse s story").
tags = soup.find_all(["b", "title"])
for tag in tags:
    print(tag.string)

# Print the id and href attribute of every <a> tag.
links = soup.find_all("a")
for link in links:
    print(link.get("id"))
    print(link.get("href"))
• tag : 标签
# Accessing a tag name as an attribute of the soup yields a Tag object.
soup = BeautifulSoup(html_doc, "lxml")
print(type(soup.p))
# Output recorded in the original notes:
#   <class 'bs4.element.Tag'>
• NavigableString : 可导航的字符串
# The text inside a tag is exposed as a NavigableString.
soup = BeautifulSoup(html_doc, "lxml")
bs_tg = soup.p.string
print(bs_tg, type(bs_tg))
# Output recorded in the original notes:
#   The Dormouse s story <class 'bs4.element.NavigableString'>
• BeautifulSoup : bs对象
# The parsed document as a whole is a BeautifulSoup object.
soup = BeautifulSoup(html_doc, "lxml")
print(type(soup))
# Output recorded in the original notes:
#   <class 'bs4.BeautifulSoup'>
• Comment : 注释
bs对文档树的操作分为三类:第一类是遍历,第二类是查找,第三类是修改
contents children descendants
• contents 返回的是一个所有子节点的列表
如下例,返回一个p标签下所有子节点的列表
# .contents returns the direct children of the first <p> as a list.
soup = BeautifulSoup(html_doc, "lxml")
print(soup.p.contents)
# Output recorded in the original notes:
#   [<b>The Dormouse s story</b>]
• children 返回的是一个子节点的迭代器
如下例,返回一个a标签下子节点的迭代器,通过for遍历
# .children is an iterator over the direct children of the first <a>.
soup = BeautifulSoup(html_doc, "lxml")
tg = soup.a.children
for t in tg:
    print(t)
# Output recorded in the original notes:
#   Elsie
• descendants 返回的是一个生成器遍历子子孙孙
# .descendants walks every nested node under <html>: children, grandchildren
# and so on, including the text nodes between tags.
soup = BeautifulSoup(html_doc, "lxml")
tg = soup.html.descendants
for t in tg:
    print(t)
    print("-" * 80)
# Output recorded in the original notes, in order. Every entry was followed by
# a line of 80 dashes. "(blank)" marks an entry that printed as empty in the
# notes, presumably a whitespace-only text node between tags. Large elements
# are described here rather than repeated in full:
#    1. <head><title>The Dormouse s story</title></head>
#    2. <title>The Dormouse s story</title>
#    3. The Dormouse s story
#    4. (blank)
#    5. the whole <body> element with all three <p> children
#    6. (blank)
#    7. <p class="title"><b>The Dormouse s story</b></p>
#    8. <b>The Dormouse s story</b>
#    9. The Dormouse s story
#   10. (blank)
#   11. the whole first <p class="story"> element, including its three <a> tags
#   12. Once upon a time there were three little sisters; and their names were
#   13. <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
#   14. Elsie
#   15. ,
#   16. <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
#   17. Lacie
#   18. and
#   19. <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
#   20. Tillie
#   21. ; and they lived at the bottom of a well.
#   22. (blank)
#   23. <p class="story">...</p>
#   24. ...
#   25. (blank)
• string获取标签里面的内容
# .string returns the text held by the first <a> tag.
soup = BeautifulSoup(html_doc, "lxml")
tg = soup.a.string
print(tg)
# Output recorded in the original notes:
#   Elsie
• strings 返回的是一个生成器对象,用来获取多个标签的内容
# .strings yields every text string under <html>, whitespace included.
soup = BeautifulSoup(html_doc, "lxml")
tg = soup.html.strings
for t in tg:
    print(t)
• stripped_strings 和strings基本一致,但是它可以把多余的空白字符(空格、换行)去掉
# .stripped_strings yields the text strings under <html> with surrounding
# whitespace removed.
soup = BeautifulSoup(html_doc, "lxml")
tg = soup.html.stripped_strings
for t in tg:
    print(t)
# Output recorded in the original notes (line breaks reconstructed, one
# string per line):
#   The Dormouse s story
#   The Dormouse s story
#   Once upon a time there were three little sisters; and their names were
#   Elsie
#   ,
#   Lacie
#   and
#   Tillie
#   ; and they lived at the bottom of a well.
#   ...
parent 和 parents
• parent直接获得父节点
• parents获取所有的父节点
# .parents iterates over every ancestor of the first <a>, innermost first.
soup = BeautifulSoup(html_doc, "lxml")
tg = soup.a.parents
for t in tg:
    print(t)
    print("-" * 60)
# Output recorded in the original notes: four entries, each followed by a line
# of 60 dashes. Large elements are described here rather than repeated in full:
#   1. the enclosing <p class="story">Once upon a time ...</p> element
#   2. the whole <body> element
#   3. the whole <html> element
#   4. the same markup as entry 3 again (presumably the top-level
#      BeautifulSoup document object)
• next_sibling 下一个兄弟结点
• previous_sibling 上一个兄弟结点
• next_siblings 下一个所有兄弟结点
• previous_siblings上一个所有兄弟结点
# .previous_sibling returns the node immediately before the first <a> at the
# same level of the tree.
soup = BeautifulSoup(html_doc, "lxml")
tg = soup.a.previous_sibling
print(tg)
find_all()
• find_all()方法以列表形式返回所有的搜索到的标签数据
• find()方法返回搜索到的第一条数据
• find_all()方法参数
def find_all(self, name=None, attrs={}, recursive=True, text=None,
limit=None, **kwargs):
• name : tag名称
• attrs : 标签的属性
• recursive : 是否递归搜索
• text : 文本内容
• limit : 限制返回条数
• kwargs : 关键字参数
html = """
<table class="tablelist" cellpadding="0" cellspacing="0">
<tbody>
<tr class="h">
<td class="l" width="374">职位名称</td>
<td>职位类别</td>
<td>人数</td>
<td>地点</td>
<td>发布时间</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=33824&keywords=python&tid=87&lid=2218">22989-金融云区块链高级研发工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=29938&keywords=python&tid=87&lid=2218">22989-金融云高级后台开发</a></td>
<td>技术类</td>
<td>2</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=31236&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐运营开发工程师(深圳)</a></td>
<td>技术类</td>
<td>2</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=31235&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐业务运维工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=34531&keywords=python&tid=87&lid=2218">TEG03-高级研发工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=34532&keywords=python&tid=87&lid=2218">TEG03-高级图像算法研发工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=31648&keywords=python&tid=87&lid=2218">TEG11-高级AI开发工程师(深圳)</a></td>
<td>技术类</td>
<td>4</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=32218&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=32217&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="odd">
<td class="l square"><a id="test" class="test" target= _blank href="position_detail.php?id=34511&keywords=python&tid=87&lid=2218">SNG11-高级业务运维工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
</tbody>
</table>
"""
# Parse the job table defined in `html` above. The soup must be built from
# `html` before the first query; otherwise the <tr> search would run against
# whatever document `soup` was last built from.
soup = BeautifulSoup(html, "lxml")

# Find every <tr> tag.
trs = soup.find_all("tr")
for tr in trs:
    print(tr)

# Find every <tr> tag except the first one (the header row).
trs = soup.find_all("tr")[1:]
for tr in trs:
    print(tr)
    print("-" * 60)

# Find every <tr> tag whose class is "odd".
trs = soup.find_all("tr", class_="odd")
for tr in trs:
    print(tr)
    print("-" * 60)

# Find all job positions: use every data row (not only the "odd" rows left
# over from the previous query) and read the first <td> of each.
trs = soup.find_all("tr")[1:]
for tr in trs:
    tds = tr.find_all("td")
    job = tds[0].string
    print(job)
需求:html解析获取所有工作岗位名称
方法:
1、与xpath思路一样,所有的工作岗位名称都在tr标签(除了第一个)内,所以要先圈定一个大的范围:trs = soup.find_all('tr')[1:]
2、岗位名称在tr标签下的td标签里。tr标签下有5个td标签,其中岗位名称在第一个标签。
# Skip the header row, then read the job title from the first <td> of each row.
trs = soup.find_all("tr")[1:]
for tr in trs:
    tds = tr.find_all("td")
    job = tds[0].string
    print(job)
import csv

from bs4 import BeautifulSoup


class Spider():
    """Extract job titles and detail links from an embedded HTML table and save them as CSV."""

    def __init__(self):
        # Static sample page: one header row followed by ten job rows.
        self.html = """
<table class="tablelist" cellpadding="0" cellspacing="0">
<tbody>
<tr class="h">
<td class="l" width="374">职位名称</td>
<td>职位类别</td>
<td>人数</td>
<td>地点</td>
<td>发布时间</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=33824&keywords=python&tid=87&lid=2218">22989-金融云区块链高级研发工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=29938&keywords=python&tid=87&lid=2218">22989-金融云高级后台开发</a></td>
<td>技术类</td>
<td>2</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=31236&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐运营开发工程师(深圳)</a></td>
<td>技术类</td>
<td>2</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=31235&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐业务运维工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=34531&keywords=python&tid=87&lid=2218">TEG03-高级研发工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=34532&keywords=python&tid=87&lid=2218">TEG03-高级图像算法研发工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=31648&keywords=python&tid=87&lid=2218">TEG11-高级AI开发工程师(深圳)</a></td>
<td>技术类</td>
<td>4</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=32218&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=32217&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="odd">
<td class="l square"><a id="test" class="test" target= _blank href="position_detail.php?id=34511&keywords=python&tid=87&lid=2218">SNG11-高级业务运维工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
</tbody>
</table>
"""

    def parser(self):
        """Return a list of ``[job_title, detail_href]`` pairs, one per data row."""
        soup = BeautifulSoup(self.html, "lxml")
        # The first <tr> is the header row, so it is skipped.
        rows = soup.find_all("tr")[1:]
        records = []
        for row in rows:
            # The first <td> of a row wraps an <a> holding both the title and the link.
            first_cell = row.find_all("td")[0]
            records.append([first_cell.string, first_cell.a.get("href")])
        return records

    def writedate(self, lsts):
        """Write a header row followed by the rows in ``lsts`` to ``招聘信息.csv``."""
        header = ["招聘岗位", "发布网址"]
        # newline="" lets the csv module control line endings itself.
        with open("招聘信息.csv", "w", encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(header)
            writer.writerows(lsts)

    def main(self):
        """Parse the embedded table and write the result to the CSV file."""
        lsts = self.parser()
        self.writedate(lsts)


if __name__ == "__main__":
    s = Spider()
    s.main()