搜索

首页  >  问答  >  正文

java - 使用Webmagic网页无法下载

使用 WebMagic 进行简单的网页数据爬取时,遇到了网页无法下载的问题。不过在调试时,偶尔又可以正常下载,挺令人抓狂。我在网上多次搜索,没有找到相关的解决办法;自己代码能力有限,还看不出问题所在,还请大神出手相救。
报的错误如下:

2017-03-31 13:55:54,610 WARN [us.codecraft.webmagic.downloader.HttpClientDownloader] - download page http://www.neofactory.co.jp/product_detail/000004/ error
java.net.SocketTimeoutException: Read timed out
    at java.net.SocketInputStream.socketRead0(Native Method)
    at java.net.SocketInputStream.socketRead(Unknown Source)
    at java.net.SocketInputStream.read(Unknown Source)
    at java.net.SocketInputStream.read(Unknown Source)
    at org.apache.http.impl.io.SessionInputBufferImpl.streamRead(SessionInputBufferImpl.java:139)
    at org.apache.http.impl.io.SessionInputBufferImpl.fillBuffer(SessionInputBufferImpl.java:155)
    at org.apache.http.impl.io.SessionInputBufferImpl.readLine(SessionInputBufferImpl.java:284)
    at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:140)
    at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:57)
    at org.apache.http.impl.io.AbstractMessageParser.parse(AbstractMessageParser.java:261)
    at org.apache.http.impl.DefaultBHttpClientConnection.receiveResponseHeader(DefaultBHttpClientConnection.java:165)
    at org.apache.http.impl.conn.CPoolProxy.receiveResponseHeader(CPoolProxy.java:167)
    at org.apache.http.protocol.HttpRequestExecutor.doReceiveResponse(HttpRequestExecutor.java:272)
    at org.apache.http.protocol.HttpRequestExecutor.execute(HttpRequestExecutor.java:124)
    at org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:271)
    at org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:184)
    at org.apache.http.impl.execchain.RetryExec.execute(RetryExec.java:88)
    at org.apache.http.impl.execchain.RedirectExec.execute(RedirectExec.java:110)
    at org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:184)
    at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:82)
    at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:107)
    at us.codecraft.webmagic.downloader.HttpClientDownloader.download(HttpClientDownloader.java:102)
    at us.codecraft.webmagic.Spider.processRequest(Spider.java:404)
    at us.codecraft.webmagic.Spider$1.run(Spider.java:321)
    at us.codecraft.webmagic.thread.CountableThreadPool$1.run(CountableThreadPool.java:74)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    at java.lang.Thread.run(Unknown Source)

我的代码

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import jxl.Cell;
import jxl.Sheet;
import jxl.Workbook;
import jxl.read.biff.BiffException;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

public class GithubRepoPageProcessor implements PageProcessor {
    jxl.Workbook readwb=null;
    String[] a=new String[]{};
    Goodsdata gd=new Goodsdata(); 
    DatabaseControl dc=new DatabaseControl();
    static ArrayList<String>list=new ArrayList<String>();
    private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setCharset("Shift_JIS");
    public void process(Page page) {
        String todey_status="";
        String maker_no="";
        String oem_no="";
        String color="";
        String material="";
        String size="";
        String innerGoods="";
        String rightMor="";
        String warning="";
        String introduction="";
        String referedGoods="";
        String similiarGoods="";
        String similiarGoodscheck="";
        maker_no=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[2]//td[1]/text()").get();
        oem_no=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[2]//td[2]/text()").get();
        color=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[4]//td[1]/text()").get();
        material=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[4]//td[2]/text()").get();
        size=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[6]//td/text()").get();
        innerGoods=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[8]//td/text()").get();
        rightMor=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[10]//td/text()").get();
        warning=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[12]//td/text()").get();
        introduction=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[14]//td/text()").get();
        String todey_status_check=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[4]//tbody//tr//td").get();
        if(todey_status_check.contains("売り切れ中です。")){
            todey_status="0";
        }else{
            String[] str=null;
            str=todey_status_check.split(">");
            todey_status=RegexString(str[str.length-2],"\\d{1,2}");
        }
        String html=page.getHtml().toString();
        a=html.split("\n");
        if(page.getHtml()
                .xpath("//p[1]//p[2]//p[2]//table[6]//tbody//tr[1]//td//table//tbody//tr[1]//th")
                .match()){
            for(int i=0;i<a.length;i++){
                if(!a[i].contains("この商品の関連商品")){
                    continue;
                }else{
                    for(int j=i+1;j<a.length;j++){
                        if(a[j].contains("</table>")){
                            referedGoods=referedGoods.substring(0, referedGoods.length()-1);
                            break;
                        }else{
                            if(a[j].contains("商品番号")){
                                    String regEx="\\d{6}|\\b\\w{2,3}\\d{3,4}";
                                    referedGoods=referedGoods+"nf-"+RegexString(a[j],regEx)+":";//调用正则函数表达式函数,返回关联商品番号]
                            }
                        }

                    }
                }
            }
        }
        if (page.getHtml()
                .xpath("//p[1]//p[2]//p[2]//table[6]//tbody//tr[2]//td//table//tbody//tr[1]//th//strong")
                .match()) {
            similiarGoodscheck = page.getHtml()
                    .xpath("//p[1]//p[2]//p[2]//table[6]//tbody//tr[2]//td//table//tbody//tr[1]//th//strong/text()")
                    .get();
            for (int i = 0; i < a.length; i++) {
                if (!a[i].contains(similiarGoodscheck)) {
                    continue;
                } else {
                    for (int j = i + 1; j < a.length; j++) {
                        if (a[j].contains("</table>")) {
                            similiarGoods = similiarGoods.substring(0, similiarGoods.length() - 1);
                            break;
                        } else {
                            if (a[j].contains("商品番号")) {
                                String regEx = "\\d{6}|\\b\\w{2,3}\\d{3,4}";
                                similiarGoods = similiarGoods + "nf-" + RegexString(a[j], regEx) + ":";// 调用正则函数表达式函数,返回关联商品番号]
                            }
                        }

                    }
                }
            }
        } 
//          System.out.println(todey_status);
//        System.out.println(maker_no+"  "+oem_no+" ");
//        System.out.println(color+" "+material+" "+size+" ");
//        System.out.println(innerGoods+" "+rightMor+" "+warning+" "+introduction);
//        System.out.println(referedGoods);
//        System.out.println(similiarGoods);
        gd.setMaker_no(maker_no);
        gd.setOem_no(oem_no);
        gd.setColor(color);
        gd.setMaterial(material);
        gd.setSize(size);
        gd.setInnerGoods(innerGoods);
        gd.setRightMor(rightMor);
        gd.setWarning(warning);
        gd.setIntroduction(introduction);
        gd.setReferedGoods(referedGoods);
        gd.setSimiliarGoods(similiarGoods);
        //dc.insert(gd);

    }
    public String RegexString(String targetStr,String patternStr){//正则表达式函数,接收目标html字符串,正则表达式
        String goodsnum=null;
        Pattern pt=Pattern.compile(patternStr);
        Matcher matcher=pt.matcher(targetStr);
        boolean rs=matcher.find();
        if(rs){
            goodsnum=matcher.group();
        }
        return goodsnum;
    }
    public Site getSite() {
        return site;
    }
    public void openXls() throws BiffException, IOException{//获得excel的内容
        try {
            int column=0;
            InputStream instream=new FileInputStream("C:\\Users\\xujio\\Desktop\\itemdatabase_neo.xls");
            readwb=Workbook.getWorkbook(instream);
            Sheet readsheet =readwb.getSheet(0);
            int rsColumn=readsheet.getColumns();
            int rsRows=readsheet.getRows();
            for(int j=0;j<rsColumn;j++){
                Cell cell=readsheet.getCell(j, 0);
                if(cell.getContents().equals("管理番号")){
                    column=j;
                    break;
                }
            }
            for(int i=1;i<rsRows;i++){
                String originNum=null;
                Cell cell=readsheet.getCell(column,i);
                originNum=cell.getContents();
                String[] numGoods=originNum.split("-");
                list.add(numGoods[1]);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }finally{
            readwb.close();
        }
    }
    public static void main(String[] args) {
        int check=0;    
        String strNum=null;
        try {
            new GithubRepoPageProcessor().openXls();//读取一个.xls文件
        } catch (BiffException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        for(int i=0;i<5;i++){
            strNum=list.get(i);//获取商品代号
            String url="http://www.neofactory.co.jp/product_detail/"+list.get(i)+"/";//获取相关商品代号下的网页的地址
            Spider.create(new GithubRepoPageProcessor()).addUrl(url).thread(5).run();        
        }
    }
}
大家讲道理大家讲道理2813 天前921

全部回复(1)我来回复

  • 迷茫

    迷茫2017-04-18 10:55:07

    亲,你的异常信息里面,那个 URL 好像本来就访问不了吧,所以 404 了,就爬不到数据了呀。

    回复
    0
  • 取消回复