PHP中文网2017-04-17 14:29:26
str.find()
Either will do; for an ordinary web page the two approaches above are enough. For sites that load their content through AJAX requests you may not be able to scrape what you want from the HTML, and finding the site's API is usually more convenient.
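For the AJAX case, the idea is to find the JSON endpoint the page calls (via the browser's network panel) and request it directly. A minimal sketch, with a made-up endpoint and response shape:

# Minimal sketch of calling a site's JSON API directly instead of parsing HTML.
# The endpoint, parameters and response fields are invented for illustration;
# replace them with what you find in the browser's network panel.
import json
import urllib2

api_url = "http://example.com/api/items?page=1"   # hypothetical endpoint
resp = urllib2.urlopen(api_url, timeout=10)
data = json.loads(resp.read())                    # AJAX responses are usually JSON
for item in data.get("items", []):
    print item.get("title")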
高洛峰2017-04-17 14:29:26
Here is a working scraping script you can use directly. It fetches the Douban IDs and titles of the films currently in theaters. The script depends on the beautifulsoup library, which you need to install first; see the BeautifulSoup Chinese documentation.
Addendum: if what you actually want is a real crawler program, one that can crawl a whole site or be configured to crawl specified pages, I'd recommend studying scrapy instead (a small sketch follows the code below).
Python scraping example code:
#!/usr/bin/env python
#coding:UTF-8

import urllib2
import traceback

from bs4 import BeautifulSoup


def fetchNowPlayingDouBanInfo():
    doubaninfolist = []
    try:
        # uncomment the following lines to go through an http proxy
        # proxy_handler = urllib2.ProxyHandler({"http" : '172.23.155.73:8080'})
        # opener = urllib2.build_opener(proxy_handler)
        # urllib2.install_opener(opener)

        url = "http://movie.douban.com/nowplaying/beijing/"

        # set the http User-Agent header
        useragent = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36'}

        req = urllib2.Request(url, headers=useragent)
        page = urllib2.urlopen(req, timeout=10)
        html_doc = page.read()
        soup = BeautifulSoup(html_doc, "lxml")
        try:
            # each currently-showing film is an <li class="list-item"> inside <div id="nowplaying">
            nowplaying_ul = soup.find("div", id="nowplaying").find("ul", class_="lists")
            lilist = nowplaying_ul.find_all("li", class_="list-item")
            for li in lilist:
                doubanid = li["id"]
                title = li["data-title"]
                doubaninfolist.append({"douban_id": doubanid, "title": title, "coverinfolist": []})
        except TypeError, e:
            print('(%s)TypeError: %s.!' % (url, traceback.format_exc()))
        except Exception:
            print('(%s)generic exception: %s.' % (url, traceback.format_exc()))
    except urllib2.HTTPError, e:
        print('(%s)http request error code - %s.' % (url, e.code))
    except urllib2.URLError, e:
        print('(%s)http request error reason - %s.' % (url, e.reason))
    except Exception:
        print('(%s)http request generic exception: %s.' % (url, traceback.format_exc()))

    return doubaninfolist


if __name__ == "__main__":
    doubaninfolist = fetchNowPlayingDouBanInfo()
    print doubaninfolist
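As for the scrapy suggestion in the addendum, a minimal spider for the same page might look roughly like this; the selectors mirror the BeautifulSoup code above and are untested assumptions about the page structure:

# Minimal Scrapy spider sketch for the same "now playing" page.
# Selectors are assumptions mirroring the code above; adjust them to the real page.
# Run with: scrapy runspider douban_nowplaying.py -o films.json
import scrapy

class NowPlayingSpider(scrapy.Spider):
    name = "douban_nowplaying"
    start_urls = ["http://movie.douban.com/nowplaying/beijing/"]

    def parse(self, response):
        # each film is an <li class="list-item"> carrying its id and data-title attributes
        for li in response.css("#nowplaying ul.lists li.list-item"):
            yield {
                "douban_id": li.attrib.get("id"),
                "title": li.attrib.get("data-title"),
            }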
巴扎黑2017-04-17 14:29:26
For something simple without a framework, have a look at the requests and beautifulsoup libraries; if you are familiar with Python syntax, after working through those two you can pretty much write a simple crawler (a minimal sketch follows below).
From what I have seen, companies that do crawling mostly use Java or Python.
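A minimal sketch of the requests + BeautifulSoup combination mentioned above; the URL is only a placeholder:

# Minimal requests + BeautifulSoup sketch: fetch a page and list its links.
# The URL is a placeholder; point it at whatever page you want to scrape.
import requests
from bs4 import BeautifulSoup

resp = requests.get("http://example.com/", timeout=10)
soup = BeautifulSoup(resp.text, "html.parser")
for a in soup.find_all("a"):
    print a.get_text(strip=True), a.get("href")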
PHP中文网2017-04-17 14:29:26
There are indeed plenty of articles online about writing a simple crawler in Python, but most of them are only toy examples and few are really usable as-is. To me a crawler boils down to fetching content, parsing it, and storing it. If you are just getting started, Googling around is enough; if you want to dig deeper, look for real code on GitHub.
My own grasp of Python is only partial; I hope this helps.
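To make the fetch / parse / store split above concrete, here is a rough skeleton; the URL, the h2 selector and the SQLite table are placeholders rather than anything from a real project:

# Rough skeleton of the fetch -> parse -> store pipeline described above.
# URL, selector and table layout are placeholders.
import sqlite3
import requests
from bs4 import BeautifulSoup

def fetch(url):
    return requests.get(url, timeout=10).text

def parse(html):
    soup = BeautifulSoup(html, "html.parser")
    return [h.get_text(strip=True) for h in soup.find_all("h2")]

def store(titles, dbfile="items.db"):
    conn = sqlite3.connect(dbfile)
    conn.execute("CREATE TABLE IF NOT EXISTS items (title TEXT)")
    conn.executemany("INSERT INTO items (title) VALUES (?)", [(t,) for t in titles])
    conn.commit()
    conn.close()

if __name__ == "__main__":
    store(parse(fetch("http://example.com/")))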
迷茫2017-04-17 14:29:26
Here is a piece of code that scrapes Tmall:
# Method taken from a larger crawler class: it assumes urllib2, re, time and
# traceback are imported at module level, and that writelog / writeInfoLog /
# insertSqlite are helper functions defined elsewhere in the project.
def areaFlow(self, parturl, tablename, date):
    while True:
        url = parturl + self.lzSession + '&days=' + str(date) + '..' + str(date)
        print url
        try:
            html = urllib2.urlopen(url, timeout=30)
        except Exception, ex:
            writelog(str(ex))
            writelog(str(traceback.format_exc()))
            break
        responegbk = html.read()
        try:
            # the response body is gbk-encoded; convert it to utf-8
            respone = responegbk.decode('gbk').encode('utf8')
        except Exception, ex:
            writelog(str(ex))
        # if lzSession has expired the response contains errcode:500
        if respone.find('"errcode":500') != -1:
            print 'nodata'
            break
        # if the date is wrong the response contains errcode:100
        elif respone.find('"errcode":100') != -1:
            print 'login error'
            self.catchLzsession()
        else:
            try:
                resstr = re.findall(r'(?<=\<)(.*?)(?=\/>)', respone, re.S)
                writelog('地域名称 浏览量 访问量')
                dictitems = []
                for iarea in resstr:
                    items = {}
                    areaname = re.findall(r'(?<=name=\\\")(.*?)(?=\\\")', iarea, re.S)
                    flowamount = re.findall(r'(?<=浏览量:)(.*?)(?=<)', iarea, re.S)
                    visitoramount = re.findall(r'(?<=访客数:)(.*?)(?=\\\")', iarea, re.S)
                    print '%s %s %s' % (areaname[0], flowamount[0], visitoramount[0])
                    items['l_date'] = str(self.nowDate)
                    items['vc_area_name'] = str(areaname[0])
                    items['i_flow_amount'] = str(flowamount[0].replace(',', ''))
                    items['i_visitor_amount'] = str(visitoramount[0].replace(',', ''))
                    items['l_catch_datetime'] = str(self.nowTime)
                    dictitems.append(items)
                writeInfoLog(dictitems)
                insertSqlite(self.sqlite, tablename, dictitems)
                break
            except Exception, ex:
                writelog(str(ex))
                writelog(str(traceback.format_exc()))
        time.sleep(1)
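The field extraction above relies on lookbehind/lookahead regexes; a tiny standalone demo of the same pattern, with a made-up sample string:

#coding:utf-8
# (?<=X)(.*?)(?=Y) grabs the text between X and Y without including either.
import re

sample = 'name=\\"Beijing\\" 浏览量:1,234<'
print re.findall(r'(?<=name=\\")(.*?)(?=\\")', sample)   # ['Beijing']
print re.findall(r'(?<=浏览量:)(.*?)(?=<)', sample)       # ['1,234']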