2009-05-04 34 views

Trả lời

14
from BeautifulSoup import BeautifulSoup 

soup = BeautifulSoup(''' 
<html> 
    <head><title>Testing</title></head> 
    <body> 
    <a href="http://foo.com/">foo</a> 
    <a href="http://bar.com/bar">Bar</a> 
    </body> 
</html>''') 

for link in soup.findAll('a'): # find all links 
    link['href'] = link['href'] + '?foo' 

print soup 

Đoạn mã trên in ra:

<html> 
<head><title>Testing</title></head> 
<body> 
<a href="http://foo.com/?foo">foo</a> 
<a href="http://bar.com/bar?foo">Bar</a> 
</body> 
</html> 

Tài liệu (documentation) của BeautifulSoup cũng có một số ví dụ về việc thay đổi thuộc tính (changing attributes). Đó là một hướng dẫn chi tiết, bao quát mọi khía cạnh thường dùng của BeautifulSoup. Tôi không rõ tài liệu còn thiếu điều gì; có lẽ bạn nên nói rõ hơn.

1

Ví dụ của tôi:

# Browser-like HTTP headers attached to every page request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "ru,en-us;q=0.7,en;q=0.3",
    "Accept-Charset": "windows-1251,utf-8;q=0.7,*;q=0.7",
    "Accept-Encoding": "identity, *;q=0",
    "Connection": "Keep-Alive",
}

# Proxy address without the "http://" scheme; None means no proxy is installed.
PROXY = None

# Value handed to socket.setdefaulttimeout() before fetching a page.
timeout = 60


def _cell_text(cell):
    """Return all text nodes of *cell* joined together, trimmed of spaces, tabs and newlines."""
    # findAll(text=True) yields an empty list for an empty cell, so this
    # returns "" instead of failing on a missing first child.
    return "".join(cell.findAll(text=True)).strip(" \t\n")


def _decode_entities(text):
    """Remove ``&nbsp;`` and decode ``&quot;`` and ``&amp;`` in *text*."""
    # "&amp;" is handled last so that "&amp;quot;" becomes the literal text
    # "&quot;" rather than being decoded twice.
    return text.replace("&nbsp;", "").replace("&quot;", '"').replace("&amp;", "&")


def parse_manuf_page_about(page_str_about):
    """Download a manufacturer "about us" page and extract its details.

    The page is fetched with ``urllib2`` using the module-level ``HEADERS``,
    ``PROXY`` and ``timeout`` settings and parsed with ``BeautifulSoup``.

    Args:
        page_str_about: URL of the page to fetch.

    Returns:
        A ``(res, slovar)`` tuple. ``res`` is ``False`` when the download
        failed (the error is printed and ``slovar`` is empty), otherwise
        ``True``. ``slovar`` maps each table header text to its cell text and
        additionally holds ``"Company_Info"``, ``"Gold member"`` (``"Y"`` or
        ``"N"``) and, when a contact link exists, ``"Contact_Person"``.

    Raises:
        IndexError: if the page lacks the ``div.win aboutUs`` block, its
            first ``<p>``, or any ``div.win`` block.
    """
    slovar = {}
    socket.setdefaulttimeout(timeout)
    if PROXY is not None:
        proxy_handler = urllib2.ProxyHandler({"http": "http://" + PROXY + "/"})
        opener = urllib2.build_opener(proxy_handler)
        urllib2.install_opener(opener)
    page_request = urllib2.Request(url=page_str_about, headers=HEADERS)
    try:
        page_zapr = urllib2.urlopen(url=page_request)
        try:
            page = page_zapr.read()
        finally:
            # Always release the connection, even if reading fails.
            page_zapr.close()
    except Exception as error:
        # Any download failure is reported through the boolean result.
        print(str(error))
        return False, slovar

    soup = BeautifulSoup(page)

    # Company description: text of the first paragraph of the "about us" box.
    about_boxes = soup.findAll('div', {"class": "win aboutUs"})
    first_paragraph = about_boxes[0].findAll("p")[0]
    Company_Info = " ".join(first_paragraph.findAll(text=True)).strip(" \t\n")

    # Details table: each row contributes one header -> value pair.
    win_boxes = soup.findAll('div', {"class": "win"})
    for row in win_boxes[0].findAll("tr"):
        header_cells = row.findAll("th")
        value_cells = row.findAll("td")
        if not header_cells or not value_cells:
            # Skip incomplete rows rather than reusing the previous row's
            # header or value.
            continue
        # When a row holds several cells of a kind, the last one wins.
        header_text = _cell_text(header_cells[-1])
        value_text = _decode_entities(_cell_text(value_cells[-1]))
        # NOTE(review): under Python 2, str() raises UnicodeEncodeError for
        # non-ASCII header text; kept so the key type is unchanged.
        slovar[str(header_text)] = value_text

    # Stored after the table so it takes precedence over a same-named header,
    # and so it is present even when the table has no rows.
    slovar["Company_Info"] = Company_Info

    # Contact name: first child of the member-name link (the last link wins).
    for contact_person in soup.findAll('a', {"class": "member-name"}):
        slovar["Contact_Person"] = contact_person.contents[0]

    # Gold status is inferred from the presence of a member-logo link.
    if soup.findAll('a', {"class": "memberLogo"}):
        slovar["Gold member"] = "Y"
    else:
        slovar["Gold member"] = "N"

    return True, slovar

Mã này phân tích trang giới thiệu của một nhà sản xuất (manufacturer) trên Alibaba.com. Bạn có thể xem trang đó tại: http://xmxinhuafeng.en.alibaba.com/aboutus.html

+0

Liệu thiết bị lưu giữ có thực sự hoạt động không? –

Các vấn đề liên quan