Home  >  Article  >  Backend Development  >  Python crawler crawls all movies of Tencent Video (code)

Python crawler crawls all movies of Tencent Video (code)

不言
不言forward
2018-10-12 15:12:58 · 8618 views

This article shows how to write a Python crawler that collects every movie listed on Tencent Video, with the complete code. It may serve as a useful reference for readers who need to do something similar.

Using python to crawl all movies of Tencent Video

# -*- coding: utf-8 -*-

import re

import urllib2

from bs4 import BeautifulSoup

import string, time

import pymongo

NUM     = 0         # global variable: number of movies

m_type  = u''       # global variable: movie type (category)

m_site  = u'qq' # global variable: movie website
def gethtml(url):
    """Fetch the page at the given URL and return its content.

    Args:
        url: Address of the page to request.

    Returns:
        The body returned by ``response.read()``; it is not decoded here.

    Raises:
        Errors raised by ``urllib2.urlopen`` (for example
        ``urllib2.URLError``) are not caught and propagate to the caller.
    """
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        return response.read()
    finally:
        # Release the underlying socket even when read() fails; the crawler
        # issues one request per listing page, so leaked handles add up.
        response.close()

def gettags(html):
    """Extract the movie categories from the category list page.

    Args:
        html: Markup of the category list page.

    Returns:
        A dict mapping each category title to the URL of its listing page.
        The dict is empty (and "Not Find" is printed) when the category
        list or its links cannot be located.

    Side effects:
        Leaves the module-level ``m_type`` set to the title of the last
        category parsed. Kept for backward compatibility with existing
        callers that may read it.
    """
    global m_type

    # Created up front so the "nothing found" paths can return it too.
    tags_url = {}

    soup = BeautifulSoup(html)

    # Container holding the category links:
    # <ul class="clearfix _group" gname="mi_type" gtype="1">
    tags_all = soup.find_all('ul', {'class': 'clearfix _group', 'gname': 'mi_type'})
    if not tags_all:
        print("Not Find")
        return tags_url

    # One link per category:
    # <a _hot="tag.sub" class="_gtag _hotkey" href="http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html" title="..." tvalue="0">...</a>
    re_tags = r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
    tags = re.compile(re_tags, re.DOTALL).findall(str(tags_all[0]))
    if not tags:
        print("Not Find")
        return tags_url

    for tag in tags:
        # Each match is (href, title, tvalue); only href and title are used.
        m_type = tag[1].decode('utf-8')
        tags_url[m_type] = tag[0].decode('utf-8')

    return tags_url

def get_pages(tag_url):
    """Get the number of listing pages for one category.

    Args:
        tag_url: URL of the first listing page of the category.

    Returns:
        The text of the second-to-last pager link (a string) when the pager
        contains more than one link, otherwise the integer ``1``. Callers
        should normalise the result with ``int()``.
    """
    tag_html = gethtml(tag_url)
    soup = BeautifulSoup(tag_html)

    # Pager element:
    # <p class="mod_pagenav" id="pager">
    p_page = soup.find_all('p', {'class': 'mod_pagenav', 'id': 'pager'})
    if not p_page:
        # No pager on the page: treat the category as a single page instead
        # of failing with IndexError.
        return 1

    # Pager link:
    # <a class="c_txt6" href="http://v.qq.com/list/1_2_-1_-1_1_0_24_20_0_-1_0.html" title="25"><span>25</span></a>
    re_pages = r'<a class=.+?><span>(.+?)</span></a>'
    pages = re.compile(re_pages, re.DOTALL).findall(str(p_page[0]))

    if len(pages) > 1:
        # NOTE(review): presumably the last link is the "next page" control,
        # which is why the second-to-last entry is used - confirm against
        # the live markup.
        return pages[-2]
    return 1

def getmovielist(html):
    """Find every movie list on a listing page and hand it to ``getmovie``.

    Args:
        html: Markup of one listing page.

    Returns:
        None. Each ``<ul class="mod_list_pic_130">`` element is serialised,
        stripped of newlines and passed to ``getmovie``.
    """
    soup = BeautifulSoup(html)

    # Movie list container:
    # <ul class="mod_list_pic_130">
    for p_html in soup.find_all('ul', {'class': 'mod_list_pic_130'}):
        # Newlines are removed before the fragment is matched in getmovie().
        getmovie(str(p_html).replace('\n', ''))

def getmovie(html):
    """Parse the movies out of one list fragment and store them in MongoDB.

    Each match becomes a document with the keys ``movie_title``,
    ``movie_url``, ``movie_site`` and ``movie_type`` (the latter two taken
    from the module globals ``m_site`` and ``m_type``) and is inserted into
    the ``playlinks`` collection of the ``dianying`` database on
    ``localhost:27017``.

    Args:
        html: Serialised markup of one movie list, with newlines removed.

    Returns:
        None. When the fragment contains no movie entries nothing is
        printed and no database connection is opened.

    Side effects:
        Increments the module-level ``NUM`` once per stored movie and prints
        progress output.
    """
    global NUM

    re_movie = r'<li><a class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\"><img.+?</li>'
    movies = re.compile(re_movie, re.DOTALL).findall(html)
    if not movies:
        return

    # pymongo.Connection was removed in PyMongo 3; MongoClient is its
    # replacement and is available from PyMongo 2.4 onwards.
    client = pymongo.MongoClient('localhost', 27017)
    try:
        playlinks = client.dianying.playlinks

        for movie in movies:
            # Count each movie exactly once (it used to be counted twice).
            NUM += 1
            print("%s : %d" % ("=" * 70, NUM))

            # Each match is (href, title).
            values = dict(
                movie_title=movie[1],
                movie_url=movie[0],
                movie_site=m_site,
                movie_type=m_type,
            )
            print(values)

            # NOTE: Collection.insert is deprecated in PyMongo 3 and removed
            # in PyMongo 4 (insert_one is the replacement); it is kept here
            # so older PyMongo installations keep working.
            playlinks.insert(values)
            print("_" * 70)
    finally:
        # One client is opened per call, so close it to avoid piling up
        # connections over the course of the crawl.
        client.close()

  

def getmovieinfo(url):
    """Fetch a movie page and extract the links found in its album block.

    Args:
        url: Address of the movie page.

    Returns:
        A list of ``(href, title)`` tuples, one per matching link. The list
        is empty (and "Not find movie info" is printed) when the album
        block or its links cannot be found.
    """
    html = gethtml(url)
    soup = BeautifulSoup(html)

    # Album block: class "pack pack_album album_cover"
    ps = soup.find_all('p', {'class': 'pack pack_album album_cover'})

    m_info = []
    if ps:
        # Link inside the block:
        # <a href="http://www.tudou.com/albumplay/9NyofXc_lHI/32JqhiKJykI.html" target="new" title="..." wl="1"> </a>
        re_info = r'<a href=\"(.+?)\" target=\"new\" title=\"(.+?)\" wl=\".+?\"> </a>'
        m_info = re.compile(re_info, re.DOTALL).findall(str(ps[0]))

    if not m_info:
        print("Not find movie info")

    return m_info

  

  

def insertdb(movieinfo):
    """Insert *movieinfo* into the ``movies`` collection of ``dianying_at``.

    Args:
        movieinfo: Value passed straight to ``Collection.insert``.

    Side effects:
        Uses the module-level ``conn``. When ``conn`` has not been assigned
        yet, a client for ``localhost:27017`` is created and stored in it.
    """
    global conn

    try:
        conn
    except NameError:
        # Nothing visible in this module assigns conn, so a call would
        # otherwise fail with NameError; connect to the same server that
        # getmovie() uses.
        conn = pymongo.MongoClient('localhost', 27017)

    conn.dianying_at.movies.insert(movieinfo)

  

if __name__ == "__main__":
    # Crawl flow: category list page -> page count per category -> every
    # listing page of that category -> getmovielist() stores the movies.
    tags_url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    tags_html = gethtml(tags_url)
    tag_urls = gettags(tags_html)

    for tag_name, tag_url in tag_urls.items():
        # getmovie() stamps every record with the global m_type. Set it per
        # category here; otherwise all movies would carry whichever category
        # gettags() happened to parse last.
        m_type = tag_name

        tag_url = tag_url.encode('utf-8')
        print(tag_url)

        maxpage = int(get_pages(tag_url))
        print(maxpage)

        # Listing URLs look like
        #   http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html
        # Strip the trailing "<page>_20_0_-1_0.html" of the first page so the
        # page index can be substituted below.
        m_url = tag_url.replace('0_20_0_-1_0.html', '')

        for x in range(0, maxpage):
            movie_url = "%s%d_20_0_-1_0.html" % (m_url, x)
            print(movie_url)

            getmovielist(gethtml(movie_url))

            # Short pause between page requests.
            time.sleep(0.1)

That concludes the walkthrough of the Python crawler that collects all movies from Tencent Video (with code). For more information, see the other related articles on the PHP Chinese website.

Statement:
This article is reproduced from cnblogs.com. If it infringes on your rights, please contact admin@php.cn to have it removed.