Home >Backend Development >Python Tutorial >Python爬取读者并制作成PDF

Python爬取读者并制作成PDF

WBOY
WBOYOriginal
2016-06-10 15:17:29 · 1032 browse

学了下 BeautifulSoup 后,做了个网络爬虫,爬取《读者》杂志并用 reportlab 制作成 PDF。

crawler.py

复制代码 代码如下:

#!/usr/bin/env python
#coding=utf-8
"""
    Author:         Anemone
    Filename:       crawler.py
    Last modified:  2015-02-19 16:47
    E-mail:         anemone@82flex.com
"""
import urllib2
from bs4 import BeautifulSoup
import re
import sys
# Python 2 only: reload the sys module, then set the interpreter-wide default
# string encoding to UTF-8.
# NOTE(review): presumably done so that implicit str/unicode conversions of the
# scraped Chinese text do not fail under the ASCII default -- confirm before
# porting; this affects every module in the process, not just this one.
reload(sys)
sys.setdefaultencoding('utf-8')
def getEachArticle(url):
    """Download one article page and extract its title, byline and body text.

    Args:
        url: Address of the article page to fetch.

    Returns:
        A dict with four keys:
            "title":   string of the first <h1> element,
            "writer":  stripped string of the element with id "pub_date",
            "from":    stripped string of the element with id "media_name",
            "context": the page text that follows the first match of the
                       "BAIDU_CLB.*;" pattern (up to the next match, if any).

    Raises:
        IndexError: If the page text contains no "BAIDU_CLB...;" marker.
        AttributeError: If the <h1>, "pub_date" or "media_name" element is
            missing (the lookup yields None).
    """
    from contextlib import closing

    # urllib2 responses are not context managers in Python 2; closing()
    # guarantees the connection is released even if read() raises.
    with closing(urllib2.urlopen(url)) as response:
        html = response.read()

    soup = BeautifulSoup(html)
    title = soup.find("h1").string
    writer = soup.find(id="pub_date").string.strip()
    _from = soup.find(id="media_name").string.strip()

    # The article body is whatever follows the ad-script fragment in the
    # flattened page text.
    parts = re.split("BAIDU_CLB.*;", soup.get_text())
    if len(parts) < 2:
        # Same exception type the bare parts[1] lookup used to raise, but with
        # a message that says which page could not be parsed.
        raise IndexError("no BAIDU_CLB marker found in page text of %s" % url)

    return {"title": title, "writer": writer, "from": _from, "context": parts[1]}
def _fetchSoup(url):
    """Download *url* and return its content parsed by BeautifulSoup."""
    from contextlib import closing

    # closing() releases the connection even if read() raises.
    with closing(urllib2.urlopen(url)) as response:
        html = response.read()
    return BeautifulSoup(html)


def getCatalog(issue, maxRetries=5):
    """Crawl one issue and return its articles grouped by column heading.

    The issue's index page is fetched first; the first link inside its first
    <table> leads to a page whose <h2> elements are treated as column
    headings. Every <a> found under a heading's parent element is downloaded
    with getEachArticle().

    Args:
        issue: Issue identifier string such as "201424". Its first four and
            last two characters form the "<first4>_<last2>/" directory of the
            site URL (presumably year and issue number -- verify).
        maxRetries: How many times to try each article before giving up on
            it. Values below 1 are treated as 1.

    Returns:
        An ordered mapping (dict subclass) from heading text to the list of
        article dicts returned by getEachArticle(), in page order. Articles
        that still fail after maxRetries attempts are reported and skipped.
    """
    from collections import OrderedDict

    url = "http://www.52duzhe.com/" + issue[:4] + "_" + issue[-2:] + "/"

    indexSoup = _fetchSoup(url + "index.html")
    firstUrl = url + indexSoup.table.a.get("href")
    soup = _fetchSoup(firstUrl)

    attempts = max(1, maxRetries)
    # OrderedDict keeps the columns in the order they appear on the page; a
    # plain dict would shuffle them in Python 2.
    duzhe = OrderedDict()
    for heading in soup.find_all("h2"):
        print(heading.string)
        duzhe[heading.string] = list()
        for link in heading.parent.find_all("a"):
            href = url + link.get("href")
            print(href)
            article = None
            for attempt in range(attempts):
                try:
                    article = getEachArticle(href)
                    break
                # Exception (not a bare except) so Ctrl-C still interrupts.
                except Exception as error:
                    print("attempt %d/%d failed for %s: %s"
                          % (attempt + 1, attempts, href, error))
            if article is None:
                # Bounded retries: a page that can never be parsed must not
                # hang the whole crawl.
                print("giving up on %s" % href)
                continue
            duzhe[heading.string].append(article)
    return duzhe
def readDuZhe(duzhe):
    """Print the title of every article in the catalog, column by column.

    Args:
        duzhe: Mapping from column name to a list of article dicts, each
            holding a "title" entry.
    """
    for articles in duzhe.values():
        for article in articles:
            print(article["title"])
if __name__ == '__main__':
    # Script entry point: crawl the hard-coded issue "201424" and list the
    # titles that were collected.
    catalog = getCatalog("201424")
    readDuZhe(catalog)

getpdf.py

复制代码 代码如下:

#!/usr/bin/env python
#coding=utf-8
"""
    Author:         Anemone
    Filename:       getpdf.py
    Last modified:  2015-02-20 19:19
    E-mail:         anemone@82flex.com
"""
#coding=utf-8
import reportlab.rl_config
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib import fonts
import copy
from reportlab.platypus import Paragraph, SimpleDocTemplate,flowables
from reportlab.lib.styles import getSampleStyleSheet
import crawler
def _deriveStyle(base, fontSize, leading, firstLineIndent=None):
    """Return a deep copy of *base* set to the 'song' font and given metrics."""
    style = copy.deepcopy(base)
    style.fontName = 'song'
    style.fontSize = fontSize
    style.leading = leading
    if firstLineIndent is not None:
        style.firstLineIndent = firstLineIndent
    return style


def writePDF(issue,duzhe):
    """Render a crawled issue into the file "duzhe<issue>.pdf".

    The document starts with a cover/contents page (issue heading, then each
    column name followed by its article titles), followed by one section per
    article, each ending with a page break.

    Args:
        issue: Issue identifier string; used in the heading and the output
            file name.
        duzhe: Mapping from column name to a list of article dicts with the
            keys "title", "writer", "from" and "context".

    Note:
        The fonts are loaded from files named "simsun.ttc" and "msyh.ttc";
        they must be locatable by ReportLab on the machine running this.
    """
    from xml.sax.saxutils import escape

    reportlab.rl_config.warnOnMissingFontGlyphs = 0
    pdfmetrics.registerFont(TTFont('song',"simsun.ttc"))
    pdfmetrics.registerFont(TTFont('hei',"msyh.ttc"))
    # addMapping(family, bold, italic, fontName): regular and italic text use
    # 'song', both bold variants use 'hei'.
    fonts.addMapping('song', 0, 0, 'song')
    fonts.addMapping('song', 0, 1, 'song')
    fonts.addMapping('song', 1, 0, 'hei')
    fonts.addMapping('song', 1, 1, 'hei')

    baseStyle = getSampleStyleSheet()['Normal']
    normalStyle = _deriveStyle(baseStyle, 11, 11, firstLineIndent=20)
    titleStyle = _deriveStyle(baseStyle, 15, 20)
    firstTitleStyle = _deriveStyle(baseStyle, 20, 20, firstLineIndent=50)
    smallStyle = _deriveStyle(baseStyle, 8, 8)

    # Paragraph parses its text as XML-like markup, so every piece of scraped
    # text is escaped first; a stray '<' or '&' would otherwise be read as
    # markup instead of being printed.
    story = []
    story.append(Paragraph(escape("读者{0}期".format(issue)), firstTitleStyle))
    for eachColumn in duzhe:
        story.append(Paragraph('__'*28, titleStyle))
        story.append(Paragraph(escape('{0}'.format(eachColumn)), titleStyle))
        for eachArticle in duzhe[eachColumn]:
            story.append(Paragraph(escape(eachArticle["title"]), normalStyle))
    story.append(flowables.PageBreak())

    for eachColumn in duzhe:
        for eachArticle in duzhe[eachColumn]:
            heading = "{0}".format(eachArticle["title"])
            byline = " {0}  {1}".format(eachArticle["writer"], eachArticle["from"])
            story.append(Paragraph(escape(heading), titleStyle))
            story.append(Paragraph(escape(byline), smallStyle))
            # Two consecutive spaces are used as the paragraph separator in
            # the scraped body text.
            for eachPara in eachArticle["context"].split("  "):
                story.append(Paragraph(escape(eachPara), normalStyle))
            story.append(flowables.PageBreak())

    doc = SimpleDocTemplate("duzhe"+issue+".pdf")
    print("Writing PDF...")
    doc.build(story)
def main(issue):
    """Crawl the given issue with the crawler module and write it out as a PDF.

    Args:
        issue: Issue identifier string passed to both crawler.getCatalog()
            and writePDF().
    """
    writePDF(issue, crawler.getCatalog(issue))
if __name__ == '__main__':
    # Script entry point: ask for the issue identifier on the console, then
    # crawl and render that issue.
    main(raw_input("Enter issue(201501):"))

以上就是本文的全部内容了,希望大家能够喜欢。

Statement:
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn