recherche

Maison  >  Questions et réponses  >  le corps du texte

java - 使用Webmagic网页无法下载

使用 WebMagic 进行简单的网页数据爬取时,遇到了网页无法下载的问题。不过在调试的时候,偶尔也会出现可以下载的情况,挺令人抓狂。在网上多次搜索,没有找到相关的解决办法;自己代码能力有限,还不能看懂问题所在,还请大神出手相救。
报的错误

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

<code>2017-03-31 13:55:54,610 WARN [us.codecraft.webmagic.downloader.HttpClientDownloader] - download page http://www.neofactory.co.jp/product_detail/000004/ error

java.net.SocketTimeoutException: Read timed out

    at java.net.SocketInputStream.socketRead0(Native Method)

    at java.net.SocketInputStream.socketRead(Unknown Source)

    at java.net.SocketInputStream.read(Unknown Source)

    at java.net.SocketInputStream.read(Unknown Source)

    at org.apache.http.impl.io.SessionInputBufferImpl.streamRead(SessionInputBufferImpl.java:139)

    at org.apache.http.impl.io.SessionInputBufferImpl.fillBuffer(SessionInputBufferImpl.java:155)

    at org.apache.http.impl.io.SessionInputBufferImpl.readLine(SessionInputBufferImpl.java:284)

    at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:140)

    at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:57)

    at org.apache.http.impl.io.AbstractMessageParser.parse(AbstractMessageParser.java:261)

    at org.apache.http.impl.DefaultBHttpClientConnection.receiveResponseHeader(DefaultBHttpClientConnection.java:165)

    at org.apache.http.impl.conn.CPoolProxy.receiveResponseHeader(CPoolProxy.java:167)

    at org.apache.http.protocol.HttpRequestExecutor.doReceiveResponse(HttpRequestExecutor.java:272)

    at org.apache.http.protocol.HttpRequestExecutor.execute(HttpRequestExecutor.java:124)

    at org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:271)

    at org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:184)

    at org.apache.http.impl.execchain.RetryExec.execute(RetryExec.java:88)

    at org.apache.http.impl.execchain.RedirectExec.execute(RedirectExec.java:110)

    at org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:184)

    at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:82)

    at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:107)

    at us.codecraft.webmagic.downloader.HttpClientDownloader.download(HttpClientDownloader.java:102)

    at us.codecraft.webmagic.Spider.processRequest(Spider.java:404)

    at us.codecraft.webmagic.Spider$1.run(Spider.java:321)

    at us.codecraft.webmagic.thread.CountableThreadPool$1.run(CountableThreadPool.java:74)

    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)

    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)

    at java.lang.Thread.run(Unknown Source)</code>

我的代码

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

<code>import java.io.FileInputStream;

import java.io.IOException;

import java.io.InputStream;

import java.net.SocketTimeoutException;

import java.util.ArrayList;

import java.util.Date;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

 

import jxl.Cell;

import jxl.Sheet;

import jxl.Workbook;

import jxl.read.biff.BiffException;

import us.codecraft.webmagic.Page;

import us.codecraft.webmagic.Site;

import us.codecraft.webmagic.Spider;

import us.codecraft.webmagic.processor.PageProcessor;

 

public class GithubRepoPageProcessor implements PageProcessor {

    jxl.Workbook readwb=null;

    String[] a=new String[]{};

    Goodsdata gd=new Goodsdata();

    DatabaseControl dc=new DatabaseControl();

    static ArrayList<String>list=new ArrayList<String>();

    private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setCharset("Shift_JIS");

    public void process(Page page) {

        String todey_status="";

        String maker_no="";

        String oem_no="";

        String color="";

        String material="";

        String size="";

        String innerGoods="";

        String rightMor="";

        String warning="";

        String introduction="";

        String referedGoods="";

        String similiarGoods="";

        String similiarGoodscheck="";

        maker_no=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[2]//td[1]/text()").get();

        oem_no=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[2]//td[2]/text()").get();

        color=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[4]//td[1]/text()").get();

        material=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[4]//td[2]/text()").get();

        size=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[6]//td/text()").get();

        innerGoods=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[8]//td/text()").get();

        rightMor=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[10]//td/text()").get();

        warning=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[12]//td/text()").get();

        introduction=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[14]//td/text()").get();

        String todey_status_check=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[4]//tbody//tr//td").get();

        if(todey_status_check.contains("売り切れ中です。")){

            todey_status="0";

        }else{

            String[] str=null;

            str=todey_status_check.split(">");

            todey_status=RegexString(str[str.length-2],"\\d{1,2}");

        }

        String html=page.getHtml().toString();

        a=html.split("\n");

        if(page.getHtml()

                .xpath("//p[1]//p[2]//p[2]//table[6]//tbody//tr[1]//td//table//tbody//tr[1]//th")

                .match()){

            for(int i=0;i<a.length;i++){

                if(!a[i].contains("この商品の関連商品")){

                    continue;

                }else{

                    for(int j=i+1;j<a.length;j++){

                        if(a[j].contains("</table>")){

                            referedGoods=referedGoods.substring(0, referedGoods.length()-1);

                            break;

                        }else{

                            if(a[j].contains("商品番号")){

                                    String regEx="\\d{6}|\\b\\w{2,3}\\d{3,4}";

                                    referedGoods=referedGoods+"nf-"+RegexString(a[j],regEx)+":";//调用正则函数表达式函数,返回关联商品番号]

                            }

                        }

 

                    }

                }

            }

        }

        if (page.getHtml()

                .xpath("//p[1]//p[2]//p[2]//table[6]//tbody//tr[2]//td//table//tbody//tr[1]//th//strong")

                .match()) {

            similiarGoodscheck = page.getHtml()

                    .xpath("//p[1]//p[2]//p[2]//table[6]//tbody//tr[2]//td//table//tbody//tr[1]//th//strong/text()")

                    .get();

            for (int i = 0; i < a.length; i++) {

                if (!a[i].contains(similiarGoodscheck)) {

                    continue;

                } else {

                    for (int j = i + 1; j < a.length; j++) {

                        if (a[j].contains("</table>")) {

                            similiarGoods = similiarGoods.substring(0, similiarGoods.length() - 1);

                            break;

                        } else {

                            if (a[j].contains("商品番号")) {

                                String regEx = "\\d{6}|\\b\\w{2,3}\\d{3,4}";

                                similiarGoods = similiarGoods + "nf-" + RegexString(a[j], regEx) + ":";// 调用正则函数表达式函数,返回关联商品番号]

                            }

                        }

 

                    }

                }

            }

        }

//          System.out.println(todey_status);

//        System.out.println(maker_no+"  "+oem_no+" ");

//        System.out.println(color+" "+material+" "+size+" ");

//        System.out.println(innerGoods+" "+rightMor+" "+warning+" "+introduction);

//        System.out.println(referedGoods);

//        System.out.println(similiarGoods);

        gd.setMaker_no(maker_no);

        gd.setOem_no(oem_no);

        gd.setColor(color);

        gd.setMaterial(material);

        gd.setSize(size);

        gd.setInnerGoods(innerGoods);

        gd.setRightMor(rightMor);

        gd.setWarning(warning);

        gd.setIntroduction(introduction);

        gd.setReferedGoods(referedGoods);

        gd.setSimiliarGoods(similiarGoods);

        //dc.insert(gd);

 

    }

    public String RegexString(String targetStr,String patternStr){//正则表达式函数,接收目标html字符串,正则表达式

        String goodsnum=null;

        Pattern pt=Pattern.compile(patternStr);

        Matcher matcher=pt.matcher(targetStr);

        boolean rs=matcher.find();

        if(rs){

            goodsnum=matcher.group();

        }

        return goodsnum;

    }

    public Site getSite() {

        return site;

    }

    public void openXls() throws BiffException, IOException{//获得excel的内容

        try {

            int column=0;

            InputStream instream=new FileInputStream("C:\\Users\\xujio\\Desktop\\itemdatabase_neo.xls");

            readwb=Workbook.getWorkbook(instream);

            Sheet readsheet =readwb.getSheet(0);

            int rsColumn=readsheet.getColumns();

            int rsRows=readsheet.getRows();

            for(int j=0;j<rsColumn;j++){

                Cell cell=readsheet.getCell(j, 0);

                if(cell.getContents().equals("管理番号")){

                    column=j;

                    break;

                }

            }

            for(int i=1;i<rsRows;i++){

                String originNum=null;

                Cell cell=readsheet.getCell(column,i);

                originNum=cell.getContents();

                String[] numGoods=originNum.split("-");

                list.add(numGoods[1]);

            }

        } catch (Exception e) {

            e.printStackTrace();

        }finally{

            readwb.close();

        }

    }

    public static void main(String[] args) {

        int check=0;   

        String strNum=null;

        try {

            new GithubRepoPageProcessor().openXls();//读取一个.xls文件

        } catch (BiffException e) {

            // TODO Auto-generated catch block

            e.printStackTrace();

        } catch (IOException e) {

            // TODO Auto-generated catch block

            e.printStackTrace();

        }

        for(int i=0;i<5;i++){

            strNum=list.get(i);//获取商品代号

            String url="http://www.neofactory.co.jp/product_detail/"+list.get(i)+"/";//获取相关商品代号下的网页的地址

            Spider.create(new GithubRepoPageProcessor()).addUrl(url).thread(5).run();       

        }

    }

}</code>

大家讲道理大家讲道理2816 Il y a quelques jours925

répondre à tous(1)je répondrai

  • 迷茫

    迷茫2017-04-18 10:55:07

    D'après les informations de votre exception, il semble que l'URL ne soit pas accessible en premier lieu ; il s'agirait donc d'une erreur 404, et les données ne peuvent pas être récupérées.

    répondre
    0
  • Annulerrépondre