ICode9

精准搜索请尝试: 精确搜索
首页 > 其他分享> 文章详细

Beautiful Soup

2020-08-23 22:32:43  阅读:253  来源: 互联网

标签:Beautiful soup bs4 doc BeautifulSoup Soup html print


Beautiful Soup 是一个可以从HTML或XML文件中提取数据的Python库.它能够通过你喜欢的转换器实现惯用的文档导航,查找,修改文档的方式。

 

html_doc = """
<html>
  <head>
    <title>The Dormouse's story</title>
  </head>
  <body>
    <p class="title">
      <b>The Dormouse's story</b> 
        <span>eng</span>
      <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
    </p>
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
    and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
    and they lived at the bottom of a well.
    

    <p class="story">...</p>
"""
  标签选择器 选择元素
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
soup.title#返回是一个tag对象
# <title>The Dormouse's story</title>
print(soup.head)
# <head>
# <title>The Dormouse's story</title>
# </head>
print(soup.p)
# <p class="title">
#<b>The Dormouse's story</b>
#<span>eng</span>
#<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
#</p>
print(soup.a)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

 

获取标签名称
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
print(soup.title.name)
# title

 

获取标签属性
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
print(soup.a['href'])
# http://example.com/elsie
print(soup.a.attrs['href'])
# http://example.com/elsie

 

获取标签文本内容
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
print(soup.title.string)
# The Dormouse's story

 

嵌套选择
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
print(soup.head.title.string)#返回是一个tag对象,可以在此之上继续选择
# The Dormouse's story

 

子节点
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
print(soup.p.contents)#返回含有所有子节点tag对象的一个列表
#['\n', <b>The Dormouse's story</b>, '\n', <span>eng</span>, '\n', <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, '\n']

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
print(soup.p.children)#返回含有所有子节点tag对象的一个列表迭代器
#<list_iterator object at 0x037F8FE8>

for i, child in enumerate(soup.p.children):
    print(i,child)

#0

#1 <b>The Dormouse's story</b>
#2

#3 <span>eng</span>
#4

#5 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
#6

 

子孙节点
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
print(soup.p.contents)#注意:contents 只返回直接子节点的列表,不包含更深层的子孙节点;要遍历所有子孙节点请使用下面的 descendants
#['\n', <b>The Dormouse's story</b>, '\n', <span>eng</span>, '\n', <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, '\n']

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
print(soup.p.descendants)#返回一个可遍历所有子孙节点(包括标签和文本)的生成器
#<generator object Tag.descendants at 0x0130B808>
for i, child in enumerate(soup.p.descendants):
    print(i,child)

#0

#1 <b>The Dormouse's story</b>
#2 The Dormouse's story
#3

#4 <span>eng</span>
#5 eng
#6

#7 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
#8 Elsie
#9

 

兄弟节点
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
print(soup.a.next_sibling)#下一个兄弟节点tag对象
print(soup.a.next_siblings)#下面所有兄弟节点tag对象
print(list(enumerate(soup.a.next_siblings)))

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
print(soup.a.previous_sibling)#上一个兄弟节点tag对象
print(soup.a.previous_siblings)#上面所有兄弟节点tag对象
print(list(enumerate(soup.a.previous_siblings)))

 

标准选择器 find_all( name , attrs , recursive , string , **kwargs )   name
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
print(soup.find_all("a"))#返回所有a标签的tag对象的列表
#[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find_all('a')[0])
#<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

 

attrs
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
print(soup.find_all("a"))#返回所有a标签的tag对象的列表
print(soup.find_all(attrs = {'id':'link2'}))#定义一个字典参数来搜索包含特殊属性的tag
print(soup.find_all(id = 'link2'))#id、class_可以简写,class要加_
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

 

string
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
print(soup.find_all("a", string="Elsie"))#返回所有字符串与 string 参数值相符的tag
# [<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>]

soup.find_all(string=["Tillie", "Elsie", "Lacie"]) # 列表
# ['Elsie', 'Lacie', 'Tillie']

import re
soup.find_all(string=re.compile("Dormouse")) #正则表达式
# ["The Dormouse's story", "The Dormouse's story"]

#find( name , attrs , recursive , string , **kwargs )#返回第一个
#find_parents() 和 find_parent()
#find_next_siblings() 和 find_next_sibling()
#find_previous_siblings() 和 find_previous_sibling()
#find_all_next() 和 find_next()
#find_all_previous() 和 find_previous()

 

CSS选择器
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
print(soup.select("title"))#传入字符串参数,返回是列表
# [<title>The Dormouse's story</title>]

print(soup.select("p:nth-of-type(2)"))#html_doc 中只有两个 p 标签,class="story" 的是第二个
# [<p class="story">...</p>]

 

通过tag标签逐层查找
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
print(soup.select("p a"))
#[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

 

找到某个tag标签下的直接子标签
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
print(soup.select("p > a"))
#[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

 

获取属性
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
print(soup.select('a[href]'))
#[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
for a in soup.select('a'):
    print(a['id'])
    print(a.attrs['id'])#另外一种写法

#link1
#link2
#link3

 

获取文本内容
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,'lxml')
for a in soup.select('a'):
    print(a.get_text())

#Elsie
#Lacie
#Tillie

 

BeautifulSoup官方文档:https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/#id37

 

标签:Beautiful,soup,bs4,doc,BeautifulSoup,Soup,html,print
来源: https://www.cnblogs.com/jifou/p/13551131.html

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有