我在抓取新浪微博一个“科技”相关的人物微博时,已经完成登录,get页面返回的数据比较混乱,用beautifulsoup.prettify()问题依旧。还请大神们帮忙看看是不是我的处理方式有问题。
本人菜鸟一枚。
贴上代码:
import sys
import urllib
import urllib2
import cookielib
import base64
import re
import json
import hashlib
import os
import rsa
import binascii
import time
import requests
import bs4
import redis
import pdb
import HTMLParser
reload(sys)
sys.setdefaultencoding('utf-8')
r=redis.Redis(host='localhost',port=6379,db=0)
r.delete('user_pool')
r.lpush('user_pool','fuckyouasshole')
r.delete('fans_pool')
r.sadd('fans_pool','fuckyouasshole')
weiboSession=requests.Session()
file = open('test.txt','w')
parameters = {
'entry': 'weibo',
'callback': 'sinaSSOController.preloginCallBack',
'su': 'bGFpcmVuMjAwNg%3D%3D',
'rsakt': 'mod',
'checkpin': '1',
'client': 'ssologin.js(v1.4.5)',
'_': '1457327347813'
}
postdata = {
'entry': 'weibo',
'gateway': '1',
'from': '',
'savestate': '7',
'useticket': '1',
'pagerefer': 'http%3A%2F%2Flogin.sina.com.cn%2Fsso%2Flogout.php%3Fentry%3Dminiblog%26r%3Dhttp%253A%252F%252Fweibo.com%252Flogout.php%253Fbackurl%253D%25252F',
'vsnf': '1',
'su': '',
'service': 'miniblog',
'servertime': '',
'nonce': '',
'pwencode': 'rsa2',
'rsakv': '',
'sp': '',
'encoding': 'UTF-8',
'prelt': '147',
'url': 'http://www.weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack&returntype=META'
}
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0',
'Accept-Encoding':'deflate, sdch'}
def get_servertime():
url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=bGFpcmVuMjAwNg%3D%3D&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.18)&_=1457327347813'
#data = urllib2.urlopen(url).read()
data=weiboSession.get(url).content
p = re.compile('\((.*)\)')
try:
json_data = p.search(data).group(1)
data = json.loads(json_data)
servertime = str(data['servertime'])
nonce = data['nonce']
pubkey = data['pubkey']
rsakv = data['rsakv']
return servertime, nonce, pubkey, rsakv
except:
print 'Get severtime error!'
return None
def get_pwd(pwd, servertime, nonce, pubkey):
rsaPublickey = int(pubkey, 16)
key = rsa.PublicKey(rsaPublickey, 65537) #创建公钥
message = str(servertime) + '\t' + str(nonce) + '\n' + str(pwd) #拼接明文 js加密文件中得到
passwd = rsa.encrypt(message, key) #加密
passwd = binascii.b2a_hex(passwd) #将加密信息转换为16进制
return passwd
def get_user(username):
username_ = urllib.quote(username)
username = base64.encodestring(username_)[:-1]
return username
def login(username, pwd):
url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)'
try:
servertime, nonce, pubkey, rsakv = get_servertime()
except:
return
global postdata
postdata['nonce'] = nonce
postdata['rsakv'] = rsakv
postdata['su'] = get_user(username)
postdata['sp'] = get_pwd(pwd, servertime, nonce, pubkey)
postdata['servertime'] = time.time()
result=weiboSession.post(url,headers=headers,data=postdata)
text = result.content
p = re.compile('location\.replace\(\'(.*?)\'\)')
try:
login_url = p.search(text).group(1)
response=weiboSession.get(login_url,headers=headers)
print u"登录成功!"
except:
print 'Login error!'
login('@163.com','**')
for i in range(0,1):
while(True):
try:
if i==0:
response=weiboSession.get('http://d.weibo.com/1087030002_2975_2009_0#',headers=headers)
else:
response=weiboSession.get('http://d.weibo.com/1087030002_2975_2009_0?page=%d#Pl_Core_F4RightUserList__4'%i,headers=headers)
s=response.content.decode("string_escape")
if re.search(r'<li class=(.*?)li>',s):
print re.search(r'<li class=\"follow_item S_line2\">.*?<\\\/li>',s).group(0)
soup_page=bs4.BeautifulSoup(s,"html.parser")#,
file.write(soup_page.prettify())
file.close()
#pdb.set_trace()
for userlist in soup_page.find('ul',class_='follow_list').find_all('li'):
user_url=userlist.dl.dd.p.a['href']
#print user_url
r.lpush('user_pool',user_url)
break
except Exception,e:
print Exception,':',e
#time.sleep(10)
pass
print r.rpop('user_pool')
file记录的内容排版非常混乱,导致用
soup_page.find('ul',class_='follow_list').find_all('li') 没法找到follow_list。提示:'None Type'object has no attribute 'find_all'