# Fetch a static page and print its <h1> element.
from urllib.request import urlopen

from bs4 import BeautifulSoup

# The context manager closes the HTTP response once the body has been read.
with urlopen("http://www.pythonscraping.com/pages/page1.html") as html:
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed on the machine.
    bsObj = BeautifulSoup(html.read(), "html.parser")

print(bsObj.h1)
# Demonstrates handling an HTTP error raised while requesting a page.
from urllib.error import HTTPError

try:
    html = urlopen("http://www.pythonscraping.com/pages/page1.html")
except HTTPError as e:
    print(e)
    # Return None, abort the program, or fall back to another plan here.
else:
    # The request succeeded, so the program continues from here.
    # NOTE: if the except branch above already returns or breaks, this else
    # clause is unnecessary and its body will not run.
    pass
处理 BeautifulSoup 异常
1 2 3 4 5 6 7 8 9
# Look up a nested tag defensively. Two failure modes are handled:
#   * the chained attribute access raises AttributeError, or
#   * the lookup succeeds but yields None.
try:
    badContent = bsObj.nonExistingTag.anotherTag
except AttributeError:
    print("Tag was not found")
else:
    # Identity comparison is the idiomatic (and unambiguous) test for None.
    if badContent is None:
        print("Tag was not found")
    else:
        print(badContent)
复杂HTML解析
使用 BeautifulSoup 按 CSS 属性(如 class)抓取特定标签
1 2 3 4 5
# Print the text of every <span class="green"> element on the page.
# The context manager closes the HTTP response after parsing.
with urlopen("http://www.pythonscraping.com/pages/warandpeace.html") as html:
    # Explicit parser: avoids results that vary with the installed parsers.
    bsObj = BeautifulSoup(html, "html.parser")

# find_all is the current method name; findAll is a deprecated alias.
nameList = bsObj.find_all("span", {"class": "green"})
for name in nameList:
    print(name.get_text())
使用 BeautifulSoup 处理 HTML 标签树
获取子标签:.children 属性
1 2 3 4
# Print each direct child of the table whose id is "giftList".
# The context manager closes the HTTP response after parsing.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    # Explicit parser: avoids results that vary with the installed parsers.
    bsObj = BeautifulSoup(html, "html.parser")

# .children is an attribute (an iterator), not a method call.
for child in bsObj.find("table", {"id": "giftList"}).children:
    print(child)
获取自身之后的兄弟标签:.next_siblings 属性
next_siblings 不包含标签自身,并且只返回位于该标签之后的兄弟标签(之前的兄弟标签不会被返回)。
1 2 3 4
# Print every sibling that follows the first <tr> of the "giftList" table.
# The context manager closes the HTTP response after parsing.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    # Explicit parser: avoids results that vary with the installed parsers.
    bsObj = BeautifulSoup(html, "html.parser")

# .next_siblings is an attribute (an iterator), not a method call.
for sibling in bsObj.find("table", {"id": "giftList"}).tr.next_siblings:
    print(sibling)
# Print the src of every <img> whose path matches the gift-image pattern.
import re

# The context manager closes the HTTP response after parsing.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    # Explicit parser: avoids results that vary with the installed parsers.
    bsObj = BeautifulSoup(html, "html.parser")

# Raw string: the pattern is unchanged, but the backslashes are no longer
# treated as (invalid) string escape sequences by the Python compiler.
gift_image_pattern = re.compile(r"\.\.\/img\/gifts/img.*\.jpg")

# find_all is the current method name; findAll is a deprecated alias.
images = bsObj.find_all("img", {"src": gift_image_pattern})
for image in images:
    print(image["src"])
# Log in with a form POST, then reuse the returned cookies by hand to
# request the profile page.
import requests

params = {'username': 'Ryan', 'password': 'password'}
# Pass the payload by keyword: despite the variable's name, it is sent as
# the form-encoded request body (data), not as URL query parameters.
r = requests.post("http://pythonscraping.com/pages/cookies/welcome.php", data=params)
print("Cookie is set to:")
print(r.cookies.get_dict())
print("-----------")
print("Going to profile page...")
# The cookies from the login response are forwarded explicitly.
r = requests.get("http://pythonscraping.com/pages/cookies/profile.php", cookies=r.cookies)
print(r.text)
处理Session
使用 requests 库跟踪 Session
1 2 3 4 5 6 7 8 9 10
# Log in and fetch the profile page through one Session, which carries the
# cookies between requests so they need not be passed by hand.
import requests

params = {'username': 'username', 'password': 'password'}

# The context manager closes the session (and its pooled connections)
# when the block exits.
with requests.Session() as session:
    # Pass the payload by keyword: it is sent as the form-encoded body.
    s = session.post("http://pythonscraping.com/pages/cookies/welcome.php", data=params)
    print("Cookie is set to:")
    print(s.cookies.get_dict())
    print("-----------")
    print("Going to profile page...")
    s = session.get("http://pythonscraping.com/pages/cookies/profile.php")
    print(s.text)
修改header
为了让请求更像是浏览器发出的,需要修改请求头
1 2 3 4 5 6 7 8 9 10
# Send a request with browser-like headers and print the table in which the
# target page echoes back the headers it received.
import requests
from bs4 import BeautifulSoup

session = requests.Session()
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) "
        "AppleWebKit 537.36 (KHTML, like Gecko) Chrome"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,*/*;q=0.8"
    ),
}
url = "https://www.whatismybrowser.com/developers/what-http-headers-is-my-browser-sending"
req = session.get(url, headers=headers)

# Explicit parser: avoids results that vary with the installed parsers.
bsObj = BeautifulSoup(req.text, "html.parser")

header_table = bsObj.find("table", {"class": "table-striped"})
if header_table is None:
    # find() returns None when nothing matches; report it instead of
    # failing with AttributeError on the next line.
    print("Header table was not found")
else:
    # get_text must be *called*; printing the bare attribute would show
    # the bound method object rather than the table's text.
    print(header_table.get_text())