淘宝模拟登陆抓取失败
# __author__ = ''
# -*- coding: utf-8 -*-
import requests
import re
# One session object so cookies set by the login response are reused by the
# follow-up GET.
s = requests.session()
login_data = {'email': 'xxx', 'password': 'xxx', }
# No explicit 'Host' header here: the earlier version sent
# 'Host: log.mmstat.com' with a request addressed to login.taobao.com, i.e. it
# named a different virtual host than the URL. When the header is omitted,
# requests fills in the Host that matches each URL.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    'Referer': 'https://www.taobao.com/',
}
# POST the credentials to log in
s.post('https://login.taobao.com/member/login.jhtml?redirectURL=https%3A%2F%2Fwww.taobao.com%2F', login_data, headers=headers)
# Check whether the login succeeded by fetching the Taobao home page
r = s.get('https://www.taobao.com')
print(r.text)
还是小白
用户名和密码省去嘞
得到的还是未登录时的代码,不知道自己少了什么,有成功的大神能告知一下咩
迷茫2017-04-17 17:34:45
Be careful to attach cookies when sending requests~
It is recommended to pay attention to the following points when simulating login:
Look at the requests the browser sends during a normal login:
Then, when submitting the form, you need to send not only the username and password but also a hidden lt field.
This field is written into a hidden input when the login form is generated, so it must be extracted from the page. The address the form is posted to also contains a jsessionid field, which you likewise need to extract from the login page and add.
In short, I hope my solution ideas can give you some guidance.
When the server detects no abnormalities between your simulated login information and the normally submitted information, the login is successful~
Imitate the normal login action, and keep studying it in your browser's network debugging tool.
Attached below is the simulated login crawler I wrote some time ago
#coding:utf-8
#!/usr/bin/python
import HTMLParser
import urlparse
import urllib
import urllib2
import cookielib
import string
import re
import sys
import time
# Python 2 only: reloading sys makes setdefaultencoding available again, and
# UTF-8 then becomes the codec used for implicit str <-> unicode conversions.
# getHTML relies on this when it tests a byte-string literal against decoded
# (unicode) page text.
reload(sys)
sys.setdefaultencoding('utf-8')
COOKIE_FLAG = True # True: use the cookie from this run; False: use the cookie from the previous run (not read anywhere in the code shown here)
# CAS login form URL.
hosturl = "http://202.203.222.202/cas/login"
# Borrowing-history page fetched after a successful login (first 200 records).
history = "http://202.203.222.202/myspace/reader/book/historyBorrow?pageSize=200&pageNo=1"
# Install a process-wide opener backed by a cookie jar, so every later
# urllib2.urlopen() call stores and sends cookies through cj.
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
# Fetch the lt token from the login page
def getHTML(stdNum):
    """Log in with student number *stdNum* and return the borrowing-history page.

    The password submitted is ``stdNum[3:]`` (the student number without its
    first three characters).

    Steps: GET the login page to obtain the JSESSIONID cookie and the hidden
    ``lt`` form value, POST the login form to a URL carrying that session id,
    fetch the history page, then log out.

    Returns:
        unicode: the decoded history page HTML on success.
        False: if no JSESSIONID cookie was received, the ``lt`` field was not
            found in the login page, or the history page contains "登录系统".
        "login failed ": if the response to the login POST contains "登录系统".
    """
    # A separate opener with its own jar is used for the first GET so the
    # JSESSIONID issued together with the login form can be read back.
    cookie = cookielib.CookieJar()
    handler = urllib2.HTTPCookieProcessor(cookie)
    opener = urllib2.build_opener(handler)
    response = opener.open(hosturl)

    # Start from None so a missing cookie is reported instead of raising
    # NameError further down.
    jsessionId = None
    for item in cookie:
        if item.name == "JSESSIONID":
            jsessionId = item.value
    if jsessionId is None:
        print("err:\tno JSESSIONID cookie received")
        return False

    loginHTML = response.read()
    searchString = 'name="lt" value='
    try:
        index = loginHTML.index(searchString)
    except ValueError as err:
        # str() is required: concatenating the exception object itself to a
        # string raises TypeError.
        print("err:\t" + str(err))
        return False
    # Skip the 16-character marker plus one more character, then take a
    # fixed 39 characters.
    # NOTE(review): assumes the skipped character is the opening quote and
    # that the token is always 39 characters long - confirm against the page.
    lt = loginHTML[index + 17:index + 56]
    # print "lt:\t" + lt
    print("JD: " + jsessionId)

    targetPost = 'http://202.203.222.202/cas/login;jsessionid=' + jsessionId + '?service=http%3A%2F%2F202.203.222.202%2Fmyspace%2Freader%2Findex'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1',
        "Host": "202.203.222.202",
        "Origin": "http://202.203.222.202",
        "Referer": "http://202.203.222.202/cas/login?service=http://202.203.222.202/myspace/reader/index",
        "Upgrade-Insecure-Requests": 1,
    }
    postData = {
        "username": stdNum,
        "password": stdNum[3:],
        "lt": lt,
        "execution": 'e1s1',
        "_eventId": "submit",
        "submit": "登录",
    }
    postData = urllib.urlencode(postData)

    # From here on the module-level opener (installed via install_opener) is
    # used, so the cookies set by the login response are kept for the
    # requests that follow.
    request = urllib2.Request(targetPost, postData, headers)
    logResult = urllib2.urlopen(request).read().decode("utf8")
    if "登录系统" in logResult:
        print("失败啦!")
        return "login failed "

    historyBorrowPage = urllib2.urlopen(history)
    historyHTML = historyBorrowPage.read().decode('utf8')
    if "登录系统" in historyHTML:
        return False

    # Log out before handing the page back.
    logout = "http://202.203.222.202/myspace/reader/logout"
    urllib2.urlopen(urllib2.Request(logout))
    return historyHTML
怪我咯2017-04-17 17:34:45
By the way, why is it Taobao's address? Just reuse the cookie from a session you have already logged in with. Zhihu seems to require a verification code too.