Startseite > Backend-Entwicklung > Python-Tutorial > Mit Python die ICP-Registrierungsinformationen einer angegebenen URL crawlen

Mit Python die ICP-Registrierungsinformationen einer angegebenen URL crawlen

高洛峰
高洛峰Original
2016-10-18 11:25:06 · 2019 Aufrufe

#coding=gbk 
import os
import sys
import re
import time
import urllib2
   
def perror_and_exit(message, status = -1):
    """Write ``message`` followed by a newline to stderr, then exit.

    ``status`` is the process exit code (default -1).  The function never
    returns: it always raises SystemExit.
    """
    line = message + '\n'
    sys.stderr.write(line)
    raise SystemExit(status)
   
def get_text_from_html_tag(html):
    """Return the content between the first ``>`` and the next ``</`` in html.

    The result is stripped of surrounding whitespace.  Nested opening tags
    are kept verbatim (``<td><a href="x">t</a></td>`` yields
    ``<a href="x">t``), so callers may need to trim further.

    Raises IndexError when html contains no ``>...</`` span.
    """
    # The pattern literal was truncated in the published listing; the
    # [1:-2] slice below (drop ">" in front, "</" behind) fixes its shape.
    pattern_text = re.compile(r">.*?</", re.DOTALL | re.MULTILINE)
    return pattern_text.findall(html)[0][1:-2].strip()
   
def parse_alexa(url, max_tries = 5000):
    """Fetch the ICP record page for ``url`` from icp.alexa.cn and parse it.

    Parameters:
        url: site to look up; surrounding whitespace (such as the newline
            kept when reading lines from stdin) is removed first.
        max_tries: maximum number of fetch attempts.  After each failed
            attempt the function sleeps 2**attempt seconds.

    Returns:
        A list of strings: [url, company_name, company_properties,
        company_icp, company_website_name, company_website_home_page,
        company_detail_url].  company_detail_url is "" when the last
        cell holds no href attribute.

    Exits through perror_and_exit (SystemExit) when no table could be
    fetched or the table does not have the expected layout.
    """
    # A trailing newline would otherwise end up in the query string and
    # in the middle of the tab-separated output record.
    url = url.strip()
    url_alexa = "http://icp.alexa.cn/index.php?q=%s" % url
    print(url_alexa)

    # NOTE(review): the tag parts of these three patterns were stripped from
    # the published listing (only ".*?" survived).  They are reconstructed
    # here as generic <table>/<tr>/<td> matchers -- confirm against the page.
    flags = re.DOTALL | re.MULTILINE
    pattern_table = re.compile(r"<table\b[^>]*>.*?</table>", flags)
    pattern_tr = re.compile(r"<tr\b[^>]*>.*?</tr>", flags)
    pattern_td = re.compile(r"<td\b[^>]*>.*?</td>", flags)

    match_table = None
    times = 0
    while times < max_tries:  # the number of retries is limited
        try:
            alexa = urllib2.urlopen(url_alexa).read()
            match_table = pattern_table.search(alexa)
            if match_table:
                break
            reason = "No table in HTML"
        except Exception as exc:
            # Exception (not a bare except) keeps Ctrl-C and SystemExit
            # working while still retrying on any fetch error.
            reason = exc
        times += 1
        delay = 2 ** times
        sys.stderr.write("fetch of %s failed: %s\n" % (url_alexa, reason))
        # Report the delay that is actually slept.
        print("try %s times:sleep %s seconds" % (times, delay))
        time.sleep(delay)

    if match_table is None:
        perror_and_exit("no table found after %s tries" % max_tries)

    match_tr = pattern_tr.findall(match_table.group())
    if len(match_tr) != 2:
        perror_and_exit("table format is incorrect")

    # The second row carries the record; cells 1-5 and 7 are read below.
    match_td = pattern_td.findall(match_tr[1])
    if len(match_td) < 8:
        perror_and_exit("table format is incorrect")

    company_name = get_text_from_html_tag(match_td[1])
    company_properties = get_text_from_html_tag(match_td[2])
    company_icp = get_text_from_html_tag(match_td[3])
    # Drop a leading opening tag, if any, in front of the ICP number.
    company_icp = company_icp[company_icp.find(">") + 1:]
    company_website_name = get_text_from_html_tag(match_td[4])
    company_website_home_page = get_text_from_html_tag(match_td[5])
    # Keep only the text after the last opening tag.
    company_website_home_page = company_website_home_page[company_website_home_page.rfind(">") + 1:]

    detail_cell = get_text_from_html_tag(match_td[7])
    match_href = re.search(r"href=\"(.*?)\"", detail_cell, flags)
    company_detail_url = match_href.group(1) if match_href else ""

    return [url, company_name, company_properties, company_icp, company_website_name, company_website_home_page, company_detail_url]
   
if __name__ == "__main__":
    # Read one URL per line from stdin and write one tab-separated record
    # per URL to out.txt.  The with-block closes the file even on errors.
    with open("out.txt", "w") as fw:
        for url in sys.stdin:
            url = url.strip()  # drop the line terminator
            if not url:
                continue  # ignore blank input lines
            fw.write("\t".join(parse_alexa(url)) + "\n")
  
#coding=gbk
import os
import sys
import re
import time
import urllib2
  
def perror_and_exit(message, status = -1):
    """Write ``message`` followed by a newline to stderr, then exit.

    ``status`` is the process exit code (default -1).  The function never
    returns: sys.exit raises SystemExit.
    """
    # The quotes were HTML-escaped (&#39;) in the published listing.
    sys.stderr.write(message + '\n')
    sys.exit(status)
  
def get_text_from_html_tag(html):
    """Return the content between the first ``>`` and the next ``</`` in html.

    The result is stripped of surrounding whitespace.  Nested opening tags
    are kept verbatim (``<td><a href="x">t</a></td>`` yields
    ``<a href="x">t``), so callers may need to trim further.

    Raises IndexError when html contains no ``>...</`` span.
    """
    # The pattern literal was truncated in the published listing; the
    # [1:-2] slice below (drop ">" in front, "</" behind) fixes its shape.
    pattern_text = re.compile(r">.*?</", re.DOTALL | re.MULTILINE)
    return pattern_text.findall(html)[0][1:-2].strip()
  
def parse_alexa(url, max_tries = 5000):
    """Fetch the ICP record page for ``url`` from icp.alexa.cn and parse it.

    Parameters:
        url: site to look up; surrounding whitespace (such as the newline
            kept when reading lines from stdin) is removed first.
        max_tries: maximum number of fetch attempts.  After each failed
            attempt the function sleeps 2**attempt seconds.

    Returns:
        A list of strings: [url, company_name, company_properties,
        company_icp, company_website_name, company_website_home_page,
        company_detail_url].  company_detail_url is "" when the last
        cell holds no href attribute.

    Exits through perror_and_exit (SystemExit) when no table could be
    fetched or the table does not have the expected layout.
    """
    # A trailing newline would otherwise end up in the query string and
    # in the middle of the tab-separated output record.
    url = url.strip()
    url_alexa = "http://icp.alexa.cn/index.php?q=%s" % url
    print(url_alexa)

    # NOTE(review): the tag parts of these three patterns were stripped from
    # the published listing (only ".*?" survived).  They are reconstructed
    # here as generic <table>/<tr>/<td> matchers -- confirm against the page.
    flags = re.DOTALL | re.MULTILINE
    pattern_table = re.compile(r"<table\b[^>]*>.*?</table>", flags)
    pattern_tr = re.compile(r"<tr\b[^>]*>.*?</tr>", flags)
    pattern_td = re.compile(r"<td\b[^>]*>.*?</td>", flags)

    match_table = None
    times = 0
    while times < max_tries:  # the number of retries is limited
        try:
            alexa = urllib2.urlopen(url_alexa).read()
            match_table = pattern_table.search(alexa)
            if match_table:
                break
            reason = "No table in HTML"
        except Exception as exc:
            # Exception (not a bare except) keeps Ctrl-C and SystemExit
            # working while still retrying on any fetch error.
            reason = exc
        times += 1
        delay = 2 ** times
        sys.stderr.write("fetch of %s failed: %s\n" % (url_alexa, reason))
        # Report the delay that is actually slept.
        print("try %s times:sleep %s seconds" % (times, delay))
        time.sleep(delay)

    if match_table is None:
        perror_and_exit("no table found after %s tries" % max_tries)

    match_tr = pattern_tr.findall(match_table.group())
    if len(match_tr) != 2:
        perror_and_exit("table format is incorrect")

    # The second row carries the record; cells 1-5 and 7 are read below.
    match_td = pattern_td.findall(match_tr[1])
    if len(match_td) < 8:
        perror_and_exit("table format is incorrect")

    company_name = get_text_from_html_tag(match_td[1])
    company_properties = get_text_from_html_tag(match_td[2])
    company_icp = get_text_from_html_tag(match_td[3])
    # Drop a leading opening tag, if any, in front of the ICP number.
    company_icp = company_icp[company_icp.find(">") + 1:]
    company_website_name = get_text_from_html_tag(match_td[4])
    company_website_home_page = get_text_from_html_tag(match_td[5])
    # Keep only the text after the last opening tag.
    company_website_home_page = company_website_home_page[company_website_home_page.rfind(">") + 1:]

    detail_cell = get_text_from_html_tag(match_td[7])
    match_href = re.search(r"href=\"(.*?)\"", detail_cell, flags)
    company_detail_url = match_href.group(1) if match_href else ""

    return [url, company_name, company_properties, company_icp, company_website_name, company_website_home_page, company_detail_url]
  
if __name__ == "__main__":
    # Read one URL per line from stdin and write one tab-separated record
    # per URL to out.txt.  The with-block closes the file even on errors.
    with open("out.txt", "w") as fw:
        for url in sys.stdin:
            url = url.strip()  # drop the line terminator
            if not url:
                continue  # ignore blank input lines
            fw.write("\t".join(parse_alexa(url)) + "\n")
            # Pause between lookups to lower the risk of the IP being blocked.
            time.sleep(2)
  
 time.sleep(2)
    pass

Nach jedem Crawl pausiert das Programm 2 Sekunden, um zu verhindern, dass die IP gesperrt wird. Tatsächlich wird die IP aber auch mit dieser Pause nach einiger Zeit gesperrt.

Da es sich um ein strukturiertes Crawling handelt, funktioniert das Programm nicht mehr, sobald sich das Format der Website ändert.


Stellungnahme:
Der Inhalt dieses Artikels wird freiwillig von Internetnutzern beigesteuert und das Urheberrecht liegt beim ursprünglichen Autor. Diese Website übernimmt keine entsprechende rechtliche Verantwortung. Wenn Sie Inhalte finden, bei denen der Verdacht eines Plagiats oder einer Rechtsverletzung besteht, wenden Sie sich bitte an admin@php.cn