Xem mã ví dụ bên dưới. Kịch bản trích xuất mã html của một trang web (ở đây trang chủ Python) và trích xuất tất cả các liên kết trong trang đó. Hi vọng điêu nay co ich.
#!/usr/bin/env python
import requests
from BeautifulSoup import BeautifulSoup
url = "http://www.python.org"
response = requests.get(url)
# parse html
page = str(BeautifulSoup(response.content))
def getURL(page):
"""
:param page: html of web page (here: Python home page)
:return: urls in that page
"""
start_link = page.find("a href")
if start_link == -1:
return None, 0
start_quote = page.find('"', start_link)
end_quote = page.find('"', start_quote + 1)
url = page[start_quote + 1: end_quote]
return url, end_quote
while True:
url, n = getURL(page)
page = page[n:]
if url:
print url
else:
break
Output:
/
#left-hand-navigation
#content-body
/search
/about/
/news/
/doc/
/download/
/getit/
/community/
/psf/
http://docs.python.org/devguide/
/about/help/
http://pypi.python.org/pypi
/download/releases/2.7.3/
http://docs.python.org/2/
/ftp/python/2.7.3/python-2.7.3.msi
/ftp/python/2.7.3/Python-2.7.3.tar.bz2
/download/releases/3.3.0/
http://docs.python.org/3/
/ftp/python/3.3.0/python-3.3.0.msi
/ftp/python/3.3.0/Python-3.3.0.tar.bz2
/community/jobs/
/community/merchandise/
/psf/donations/
http://wiki.python.org/moin/Languages
http://wiki.python.org/moin/Languages
http://www.google.com/calendar/ical/b6v58qvojllt0i6ql654r1vh00%40group.calendar.google.com/public/basic.ics
http://www.google.com/calendar/ical/j7gov1cmnqr9tvg14k621j7t5c%40group.calendar.google.com/public/basic.ics
http://pycon.org/#calendar
http://www.google.com/calendar/ical/3haig2m9msslkpf2tn1h56nn9g%40group.calendar.google.com/public/basic.ics
http://pycon.org/#calendar
http://www.psfmember.org
...