首页 >后端开发 >Python教程 >python实现爬虫下载漫画示例

python实现爬虫下载漫画示例

WBOY 原创
2016-06-06 11:29:10 · 1971浏览

代码如下:


#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
# Runtime configuration shared by the functions in this script.  These are
# plain module-level names, so no ``global`` declaration is needed at this
# scope (the statement is a no-op outside a function body).
weburl=''            # URL of the comic index page, fetched by the main section (argv[1])
floder=''            # destination folder prefix handed to downloadchapter (argv[2])
chapterbegin=0       # index of the first chapter to download (argv[3], optional)
currentthreadnum=0   # number of download threads currently running
threadcount=6        # upper bound on concurrent download threads (argv[4], optional)

# The two positional arguments are mandatory; print the usage line and stop
# when they are missing.
if len(sys.argv)<3:
    print("usage: downloadmanhua weburl floder chapterbegin=0 threadnum=6")
    sys.exit(0)

weburl=sys.argv[1]
floder=sys.argv[2]
if len(sys.argv)>=4:
    chapterbegin=int(sys.argv[3])
if len(sys.argv)>=5:
    threadcount=int(sys.argv[4])

 

def jin(i,jinzhi):
    """Return the non-negative integer ``i`` written in base ``jinzhi``.

    Digit values 0-9 are rendered as decimal digits and values from 10
    upwards as lowercase letters starting at ``'a'``, so ``jin(35, 36)``
    is ``'z'`` and ``jin(36, 36)`` is ``'10'``.  Zero is rendered as ``'0'``.

    Args:
        i: the non-negative integer to convert.
        jinzhi: the target base (2 or greater).

    Returns:
        The digits of ``i`` in base ``jinzhi``, most significant first.

    Raises:
        ValueError: if ``i`` is negative or ``jinzhi`` is smaller than 2.
    """
    if i < 0:
        raise ValueError("i must be non-negative, got %r" % (i,))
    if jinzhi < 2:
        raise ValueError("jinzhi must be at least 2, got %r" % (jinzhi,))

    digits = []
    while True:
        # Integer divmod keeps arbitrarily large values exact; true division
        # followed by int() would round through a float.
        i, remainder = divmod(i, jinzhi)
        if remainder > 9:
            digits.append(chr(ord('a') + (remainder - 10)))
        else:
            digits.append(str(remainder))
        if i == 0:
            break
    # Digits were produced least significant first.
    return "".join(reversed(digits))
def urlparse(p,a,c,k):
    """Expand the placeholder tokens in ``p`` using the word list ``k``.

    A lookup table is built that maps the base-36 spelling of every index
    ``n`` in ``range(c)`` (as produced by ``jin(n, 36)``) to ``k[n]``; when
    ``k[n]`` is empty the token maps to itself.  Every word in ``p`` that is
    a key of that table is then replaced by its entry, and everything else
    is copied through unchanged.

    NOTE(review): the parameter names and the table construction look like
    a port of the JavaScript "p,a,c,k,e,d" unpacker -- confirm against the
    pages this script targets.  The published listing substituted one
    character at a time and its exact character ranges are not recoverable;
    whole-word substitution gives the same result when every token is a
    single character and also handles longer tokens.

    Args:
        p: the packed text containing placeholder tokens.
        a: unused; kept so existing callers keep working.
        c: number of entries of ``k`` to use.
        k: replacement words, indexed by token value.

    Returns:
        ``p`` with every known token replaced.

    Raises:
        IndexError: if ``c`` is larger than ``len(k)``.
    """
    table = {}
    for index in range(c):
        token = jin(index, 36)
        # An empty word means "this token stands for itself".
        table[token] = k[index] or token

    def expand(match):
        word = match.group(0)
        return table.get(word, word)

    return re.sub(r'\b\w+\b', expand, p)
def meipower(s):
    """Extract the image path from a packed ``eval(...)`` script string.

    The text starting at the first ``}(`` is taken as the argument list of
    the packed call, split on commas, and its first four fields are handed
    to ``urlparse`` to recover the unpacked script.  The value assigned to
    ``imgpath`` in that script is returned.

    Args:
        s: the packed script text as found in the chapter page.

    Returns:
        The image path string taken from the ``imgpath=`` assignment.

    Raises:
        IndexError: if ``s`` contains no ``}(`` or the unpacked script has
            no ``imgpath=`` assignment.
        ValueError: if the second or third field is not an integer.
    """
    p=re.compile(r"(?=}\().*",re.IGNORECASE)
    s=p.findall(s)
    s=s[0]
    # Drop the last 19 characters -- presumably the fixed trailer of the
    # packed call; TODO confirm against a real page.
    s=s[0:(len(s)-19)]
    par=s.split(',')
    # Skip the first character of the word-list field (presumably its
    # opening quote) before splitting it into words.
    par[3]=par[3][1:len(par[3])]
    answer=par[3].split('|')
    chapterpath=urlparse(par[0],int(par[1]),int(par[2]),answer)
    allurl=re.findall('imgpath=[^;]*',chapterpath)[0]
    # Keep what follows the first 10 characters ('imgpath=' plus two more)
    # and drop the final 2 -- assumed to be quoting; TODO confirm.
    allurl=allurl[10:(len(allurl)-2)]
    return allurl
def pictofile(weburl,filename,loop=100):
    """Download ``weburl`` into ``filename``, retrying on failure.

    Nothing is downloaded when ``filename`` already exists.  A response
    shorter than 2048 bytes is treated as a failed attempt.  Up to
    ``loop + 1`` attempts are made; when all of them fail a message is
    printed and the function returns without creating the file.

    Args:
        weburl: URL of the picture to fetch.
        filename: path of the file to write.
        loop: number of retries allowed after the first attempt; a negative
            value means no attempt is made at all.

    Returns:
        None.
    """
    attempts_left=loop
    while attempts_left>=0:
        attempts_left-=1
        # Re-checked on every attempt: another download (or an earlier
        # attempt) may have produced the file in the meantime.
        if os.path.exists(filename):
            return
        try:
            # ``with`` closes the response and the file even when an
            # exception interrupts the attempt.
            with urllib.request.urlopen(weburl) as response:
                data=response.read()
            if len(data)<2048:
                continue
            print('download from %s name is %s\n'%(weburl,filename))
            with open('%s'%filename,'wb') as myfile:
                myfile.write(data)
            return
        except socket.timeout:
            print('timeout')
        except Exception as e:
            # Best-effort download: report the problem and try again.
            print('error',e)
    print('can\'t download the picture %s'%weburl)
def downloadpic(url,loadpicdir,num):
    """Thread body: download the picture at ``url`` into ``loadpicdir``.

    The folder is created when it does not exist yet.  The file is named
    ``<num>-manhua<tail>`` where ``<tail>`` is the trailing run of
    lowercase letters, digits and dots of ``url``.  The running-thread
    counter ``currentthreadnum`` is decremented when the function finishes,
    whether or not the download succeeded, so the slot is always given back.

    Args:
        url: URL of the picture to download.
        loadpicdir: folder the picture is stored in.
        num: value used as the file-name prefix.

    Returns:
        None.
    """
    global currentthreadnum
    try:
        # mutex2 serialises folder creation between worker threads.  The
        # working directory is deliberately left alone: it is shared by the
        # whole process, and changing it would break relative paths used by
        # the other threads.
        with mutex2:
            if not os.path.isdir(loadpicdir):
                print("can't open the floder %s will be create"%loadpicdir)
                try:
                    os.makedirs(loadpicdir,exist_ok=True)
                except OSError:
                    print("can't create floder %s"%loadpicdir)
                    return
                print('create floder succeed')
        name=re.findall(r'[0-9a-z.]*\Z',url)
        filename='manhua'+name[0]
        pictofile(url,loadpicdir+'//'+str(num)+'-'+filename)
    finally:
        with mutex:
            currentthreadnum=currentthreadnum-1
def downloadchapter(url,loadpicdir,num,begin=0):
    """Download pictures ``begin`` .. ``num - 1`` of one chapter.

    The chapter page at ``manhuaweb + url`` is fetched, the chapter name is
    read from its title and the picture base path from its packed script.
    One ``downloadpic`` thread is started per picture, with at most
    ``threadcount`` threads running at a time.

    Args:
        url: chapter path, appended to ``manhuaweb``.
        loadpicdir: folder prefix; the chapter name is appended to it
            directly, so it should end with a path separator.
        num: number of pictures in the chapter.
        begin: index of the first picture to download.

    Returns:
        None.

    Raises:
        IndexError: if the page has no title or no packed script.
    """
    global currentthreadnum
    print(manhuaweb+url)
    with urllib.request.urlopen(manhuaweb+url) as response:
        webdata=response.read().decode('UTF-8')
    # NOTE(review): the published listing shows r'[^_]*' here, but the
    # 7-character slice below only makes sense if the pattern starts with
    # the 7-character tag '<title>' -- confirm against a real page.
    chaptername=re.findall(r'<title>[^_]*',webdata)[0]
    chaptername=chaptername[7:]
    webscrip=re.findall(r'eval.*[^<>]',webdata)
    chapterurl='http://mhimg.ali213.net'+meipower(webscrip[0])
    for i in range(begin,num):
        # Wait for a free slot; downloadpic gives it back when it finishes.
        while currentthreadnum>=threadcount:
            time.sleep(0.5)
        with mutex:
            currentthreadnum=currentthreadnum+1
        try:
            threading.Thread(target=downloadpic,args=(r'%s%d.jpg'%(chapterurl,i),loadpicdir+chaptername,num)).start()
        except Exception as error:
            # The thread never ran, so give its slot back before stopping.
            with mutex:
                currentthreadnum=currentthreadnum-1
            print(error,'break')
            print('download chapter %d of picture make a error'%i)
            break


if __name__=='__main__':
    manhuaweb=r'http://manhua.ali213.net'
    socket.setdefaulttimeout(60.0)
    mutex=threading.Lock()    # guards currentthreadnum
    mutex2=threading.Lock()   # serialises folder creation in downloadpic

    with urllib.request.urlopen(weburl) as webfile:
        webdata=webfile.read().decode('UTF-8')
    # Section of the index page that lists the chapters.
    meshmode=re.compile(r'<div class="detail_body_right_sec_con">.*</div>')
    meshdata=meshmode.findall(webdata)[0]
    # Page-count labels and chapter links, in the order they appear.
    indexmode=re.compile(r'([0-9]*页)')
    indexdata=indexmode.findall(meshdata)
    picurlmode=re.compile(r'/comic/[0-9/]*.html')
    picurldata=picurlmode.findall(meshdata)

    chapterlength=len(picurldata)
    nummode=re.compile(r'[\d]+')

    # Chapters are taken from the end of the list backwards, starting
    # chapterbegin entries from the end.
    i=chapterbegin
    while i<chapterlength:
        manhuachapter=picurldata[chapterlength-i-1]
        downloadchapter(manhuachapter,floder,int(nummode.findall(indexdata[chapterlength-i-1])[0]))
        i=i+1
style="display:block" data-ad-format="autorelaxed" data-ad-client="ca-pub-5902227090019525" data-ad-slot="5027754603"></ins><script> (adsbygoogle = window.adsbygoogle || []).push({}); </script><footer><div class="footer"><div class="footertop"><img src="/static/imghwm/logo.png" alt=""><p>公益在线PHP培训,帮助PHP学习者快速成长!</p></div><div class="footermid"><a href="https://m.php.cn/zh/about/us.html">关于我们</a><a href="https://m.php.cn/zh/about/disclaimer.html">免责声明</a><a href="https://m.php.cn/zh/update/article_0_1.html">Sitemap</a></div><div class="footerbottom"><p> © php.cn All rights reserved </p></div></div></footer><script>isLogin = 0;</script><script type="text/javascript" src="/static/layui/layui.js"></script><script type="text/javascript" src="/static/js/global.js?4.9.47"></script></div><script src="https://vdse.bdstatic.com//search-video.v1.min.js"></script><link rel='stylesheet' id='_main-css' href='/static/css/viewer.min.css' type='text/css' media='all'/><script type='text/javascript' src='/static/js/viewer.min.js?1'></script><script type='text/javascript' src='/static/js/jquery-viewer.min.js'></script><script>jQuery.fn.wait = function (func, times, interval) { var _times = times || -1, //100次 _interval = interval || 20, //20毫秒每次 _self = this, _selector = this.selector, //选择器 _iIntervalID; //定时器id if( this.length ){ //如果已经获取到了,就直接执行函数 func && func.call(this); } else { _iIntervalID = setInterval(function() { if(!_times) { //是0就退出 clearInterval(_iIntervalID); } _times <= 0 || _times--; //如果是正数就 -- _self = $(_selector); //再次选择 if( _self.length ) { //判断是否取到 func && func.call(_self); clearInterval(_iIntervalID); } }, _interval); } return this; } $("table.syntaxhighlighter").wait(function() { $('table.syntaxhighlighter').append("<p class='cnblogs_code_footer'><span class='cnblogs_code_footer_icon'></span></p>"); }); $(document).on("click", ".cnblogs_code_footer",function(){ $(this).parents('table.syntaxhighlighter').css('display','inline-table');$(this).hide(); }); 
$('.nphpQianCont').viewer({navbar:true,title:false,toolbar:false,movable:false,viewed:function(){$('img').click(function(){$('.viewer-close').trigger('click');});}}); </script></body><!-- Matomo --><script> var _paq = window._paq = window._paq || []; /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ _paq.push(['trackPageView']); _paq.push(['enableLinkTracking']); (function() { var u="https://tongji.php.cn/"; _paq.push(['setTrackerUrl', u+'matomo.php']); _paq.push(['setSiteId', '9']); var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); })(); </script><!-- End Matomo Code --></html>