search
HomeWeb Front-endHTML Tutorial手写的爬虫程序,程序可以成功运行,只是效率太低,十几秒才爬一条数据,求大神指点提高效率,谢谢_html/css_WEB-ITnose

import .....
/**
 * 获取****的数据
 */
public class DoMain3 {
/**
* 根据网页url获取页面内容
*/
public String getHtmlString(String url){
String hs="";
try {
URL u = new URL(url);
HttpURLConnection conn = (HttpURLConnection)u.openConnection(); 
conn.setRequestProperty("User-Agent","MSIE 7.0");
StringBuffer HtmlString = new StringBuffer();
BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(),"utf-8"));
String line="";
while((line=br.readLine())!=null){
HtmlString.append(line+"\n");
}
hs=HtmlString.toString();
System.out.println(url);
} catch (Exception e) {
System.out.println("URL地址加载出错!!");
e.printStackTrace();
}
return hs;
}
public static void main(String rags[]){
Dao d = new Dao();
DoMain3 dm = new DoMain3();
String title="";
String section="";
String content="";
String contentTitle="";
int count=110;

String url="http://*************************" ;
if(d.createTable()){
System.out.println("建表成功!!!");
try {
//加载标题页面
Document doc = Jsoup.parse(dm.getHtmlString(url));
Element titles = doc.getElementById("maincontent");
Elements lis=titles.getElementsByTag("li");
//*********************标题****************************
for(int i=1;iElements a = lis.get(i).getElementsByTag("a");
if(a.toString().equals("")){
title=lis.get(i).text();
contentTitle=title;
String data[]={contentTitle,title,section,content,url};
if(d.pinsertData(data)){
System.out.println("第"+(i+1)+"题数据插入成功!!!");
System.out.println("*****************"+count+"*****************");
}else{
System.out.println("第"+(i+1)+"题节数据插入失败!!!");
System.out.println("*****************"+count+"*****************");
break;
}
count++;
continue;
}else{
title=a.get(0).text();
url="http://****************"+a.get(0).attr("href");
//加载章节页面
Document doc2=Jsoup.parse(dm.getHtmlString(url));
Element sections =doc2.getElementById("maincontent");
Elements ls = sections.getElementsByTag("li");
//**********************节************************
for(int j=0;jElements link = ls.get(j).getElementsByTag("a");
if(link.toString().equals("")){
section=ls.get(j).text();
contentTitle=title+" "+section;
}else{
section = link.get(0).text();
url="http:*******************"+link.get(0).attr("href");
//加载内容页面
Document doc3=Jsoup.parse(dm.getHtmlString(url));
Element contents=doc3.getElementById("maincontent");
content=contents.text();
//处理内容字符串
content=content.substring(content.indexOf("?")+"?".length());
content=content.replace("'", "''");
contentTitle=title+" "+section;
}
System.out.println("****************"+count+"******************");
System.out.println("正在读第"+(i+1)+"题"+(j+1)+"节");


//往数据库插入数据
String data[]={contentTitle,title,section,content,url};
if(d.pinsertData(data)){
System.out.println("第"+(i+1)+"题"+(j+1)+"节数据插入成功!!!");
System.out.println("*****************"+count+"*****************");
count++;
}else{
System.out.println("第"+(i+1)+"题"+(j+1)+"节数据插入失败!!!");
System.out.println("*****************"+count+"*****************");
break;
}
}//end for
}

System.out.println("第"+(i+1)+"题采集完毕");


}//end for

System.out.println("采集完毕!!");

} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}

?


回复讨论(解决方案)

开多个线程跑

主要是这两句,debug的时候老是在这两句停好长时间
1.BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(),"utf-8"))

2.while((line=br.readLine())!=null){
HtmlString.append(line+"\n");
}

用jsoup吧,很简单,也很好爬

一开始就是用的jsoup  效率比这个还低,就在 Document doc = Jsoup.parse(method.getResponseBodyAsString()); 这一步就走不动了,很头疼,有人建议我用sax解析,但是sax能用来解析html吗?

多线程+提高带宽

** 
 * 获取**************的数据 
 * @author wf 
 * 
 */ 
public class DoMain5 { 


public Document getDoc(String url){ 
Document doc=null; 
try { 
doc=Jsoup.connect(url).get(); 
} catch (Exception e) { 
System.out.println("文档解析失败!!"); 
e.printStackTrace(); 

return doc; 


public static void main(String rags[]){ 
Dao d = new Dao(); 
DoMain5 dm = new DoMain5(); 

String title=""; 
String section=""; 
String content=""; 
String contentTitle=""; 
int count=630; 

String url="******************" ; 

if(d.createTable()){ 
System.out.println("建表成功!!!"); 

try { 
Document doc = dm.getDoc(url); 
System.out.println(doc); 
Element titles = doc.getElementById("maincontent"); 
Elements lis=titles.getElementsByTag("li"); 
//*********************标题**************************** 
for(int i=1;iElements a = lis.get(i).getElementsByTag("a"); 
if(a.toString().equals("")){ 
title=lis.get(i).text(); 
contentTitle=title; 

String data[]={contentTitle,title,section,content,url}; 
if(d.pinsertData(data)){ 
System.out.println("第"+(i+1)+"题数据插入成功!!!"); 
System.out.println("*****************"+count+"*****************"); 
}else{ 
System.out.println("第"+(i+1)+"题节数据插入失败!!!"); 
System.out.println("*****************"+count+"*****************"); 
break; 

count++; 
continue; 
}else{ 
title=a.get(0).text(); 

url="http:***************"+a.get(0).attr("href"); 
Document doc2=dm.getDoc(url); 
Element sections =doc2.getElementById("maincontent"); 
Elements ls = sections.getElementsByTag("li"); 
//**********************节************************ 
for(int j=507;jElements link = ls.get(j).getElementsByTag("a"); 
if(link.toString().equals("")){ 
section=ls.get(j).text(); 
contentTitle=title+" "+section; 
}else{ 
section = link.get(0).text(); 
url="http:****************"+link.get(0).attr("href"); 
Document doc3=dm.getDoc(url); 
Element contents=doc3.getElementById("maincontent"); 
content=contents.text(); 
//处理内容字符串 
content=content.substring(content.indexOf("?")+"?".length()); 
content=content.replace("'", "''"); 
contentTitle=title+" "+section; 

System.out.println("****************"+count+"******************"); 
System.out.println("正在读第"+(i+1)+"题"+(j+1)+"节"); 


String data[]={contentTitle,title,section,content,url}; 

if(d.pinsertData(data)){ 
System.out.println("第"+(i+1)+"题"+(j+1)+"节数据插入成功!!!"); 
System.out.println("*****************"+count+"*****************"); 
count++; 
}else{ 
System.out.println("第"+(i+1)+"题"+(j+1)+"节数据插入失败!!!"); 
System.out.println("*****************"+count+"*****************"); 
break; 

}//end for 


System.out.println("第"+(i+1)+"题采集完毕"); 
break; 
}//end for 

System.out.println("采集完毕!!"); 

} catch (Exception e) { 

e.printStackTrace(); 


经过各位大声指点修改后  这个程序效率有明显提高,不过现在运行起来随时随地会抛出下面两个异常,还请各位大虾指点怎么解决:

1.java.net.SocketTimeoutException: Read timed out
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.read(SocketInputStream.java:129)
at java.io.BufferedInputStream.fill(BufferedInputStream.java:218)
at java.io.BufferedInputStream.read1(BufferedInputStream.java:258)
at java.io.BufferedInputStream.read(BufferedInputStream.java:317)
at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:687)
at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:632)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream
(HttpURLConnection.java:1064)
at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:373)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:429)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:410)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:164)
at org.jsoup.helper.HttpConnection.get(HttpConnection.java:153)
at com.wanfang.dousact.DoMain5.getDoc(DoMain5.java:35)
at com.wanfang.dousact.DoMain5.main(DoMain5.java:61)

2.java.net.SocketTimeoutException: connect timed out 
at java.net.PlainSocketImpl.socketConnect(Native Method) 
at java.net.PlainSocketImpl.doConnect(PlainSocketImpl.java:333) 
at java.net.PlainSocketImpl.connectToAddress(PlainSocketImpl.java:195) 
at java.net.PlainSocketImpl.connect(PlainSocketImpl.java:182) 
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:366) 
at java.net.Socket.connect(Socket.java:519) 
at sun.net.NetworkClient.doConnect(NetworkClient.java:158) 
at sun.net.www.http.HttpClient.openServer(HttpClient.java:394) 
at sun.net.www.http.HttpClient.openServer(HttpClient.java:529) 
at sun.net.www.http.HttpClient.(HttpClient.java:233) 
at sun.net.www.http.HttpClient.New(HttpClient.java:306) 
at sun.net.www.http.HttpClient.New(HttpClient.java:323) 
at sun.net.www.protocol.http.HttpURLConnection.getNewHttpClient 
(HttpURLConnection.java:852) 
at sun.net.www.protocol.http.HttpURLConnection.plainConnect 
(HttpURLConnection.java:793) 
at sun.net.www.protocol.http.HttpURLConnection.connect(HttpURLConnection.java:718) 
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:425) 
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:410) 
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:164) 
at org.jsoup.helper.HttpConnection.get(HttpConnection.java:153) 
at com.wanfang.dousact.DoMain5.getDoc(DoMain5.java:35) 
at com.wanfang.dousact.DoMain5.main(DoMain5.java:87) 

你可以参考这个看看

  https://www.baidu.com/s?wd=jsoup%20%E5%A4%AA%E6%85%A2&rsv_spt=1&rsv_iqid=0xa4c58e5b0001928e&issp=1&f=3&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&oq=java%2520write%2520%255Ct%2520%25E6%25B2%25A1%25E7%2594%25A8&inputT=11038&rsv_t=64ecKnkhyG%2Bspt6MnEr2Ttfue0gE4iYduVY65jj1n6jePnM1gL%2FwO3GVvk4XcSPt8z5R&rsv_pq=e14f800e00019f3d&sug=jsoup%E8%A7%A3%E6%9E%90html&rsv_sug3=27&rsv_sug1=19&rsv_n=1&rsv_sug2=0&prefixsug=jsoup%2520%25E5%25A4%25AA%25E6%2585%25A2&rsp=0&rsv_sug4=12115

Statement
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn
What is the purpose of the <datalist> element?What is the purpose of the <datalist> element?Mar 21, 2025 pm 12:33 PM

The article discusses the HTML <datalist> element, which enhances forms by providing autocomplete suggestions, improving user experience and reducing errors.Character count: 159

How do I use HTML5 form validation attributes to validate user input?How do I use HTML5 form validation attributes to validate user input?Mar 17, 2025 pm 12:27 PM

The article discusses using HTML5 form validation attributes like required, pattern, min, max, and length limits to validate user input directly in the browser.

What is the purpose of the <iframe> tag? What are the security considerations when using it?What is the purpose of the <iframe> tag? What are the security considerations when using it?Mar 20, 2025 pm 06:05 PM

The article discusses the <iframe> tag's purpose in embedding external content into webpages, its common uses, security risks, and alternatives like object tags and APIs.

What is the purpose of the <progress> element?What is the purpose of the <progress> element?Mar 21, 2025 pm 12:34 PM

The article discusses the HTML <progress> element, its purpose, styling, and differences from the <meter> element. The main focus is on using <progress> for task completion and <meter> for stati

What is the purpose of the <meter> element?What is the purpose of the <meter> element?Mar 21, 2025 pm 12:35 PM

The article discusses the HTML <meter> element, used for displaying scalar or fractional values within a range, and its common applications in web development. It differentiates <meter> from <progress> and ex

What are the best practices for cross-browser compatibility in HTML5?What are the best practices for cross-browser compatibility in HTML5?Mar 17, 2025 pm 12:20 PM

Article discusses best practices for ensuring HTML5 cross-browser compatibility, focusing on feature detection, progressive enhancement, and testing methods.

What is the viewport meta tag? Why is it important for responsive design?What is the viewport meta tag? Why is it important for responsive design?Mar 20, 2025 pm 05:56 PM

The article discusses the viewport meta tag, essential for responsive web design on mobile devices. It explains how proper use ensures optimal content scaling and user interaction, while misuse can lead to design and accessibility issues.

How do I use the HTML5 <time> element to represent dates and times semantically?How do I use the HTML5 <time> element to represent dates and times semantically?Mar 12, 2025 pm 04:05 PM

This article explains the HTML5 <time> element for semantic date/time representation. It emphasizes the importance of the datetime attribute for machine readability (ISO 8601 format) alongside human-readable text, boosting accessibilit

See all articles

Hot AI Tools

Undresser.AI Undress

Undresser.AI Undress

AI-powered app for creating realistic nude photos

AI Clothes Remover

AI Clothes Remover

Online AI tool for removing clothes from photos.

Undress AI Tool

Undress AI Tool

Undress images for free

Clothoff.io

Clothoff.io

AI clothes remover

AI Hentai Generator

AI Hentai Generator

Generate AI Hentai for free.

Hot Article

R.E.P.O. Energy Crystals Explained and What They Do (Yellow Crystal)
2 weeks agoBy尊渡假赌尊渡假赌尊渡假赌
R.E.P.O. Best Graphic Settings
2 weeks agoBy尊渡假赌尊渡假赌尊渡假赌
R.E.P.O. How to Fix Audio if You Can't Hear Anyone
2 weeks agoBy尊渡假赌尊渡假赌尊渡假赌

Hot Tools

Dreamweaver Mac version

Dreamweaver Mac version

Visual web development tools

SublimeText3 Chinese version

SublimeText3 Chinese version

Chinese version, very easy to use

SAP NetWeaver Server Adapter for Eclipse

SAP NetWeaver Server Adapter for Eclipse

Integrate Eclipse with SAP NetWeaver application server.

Safe Exam Browser

Safe Exam Browser

Safe Exam Browser is a secure browser environment for taking online exams securely. This software turns any computer into a secure workstation. It controls access to any utility and prevents students from using unauthorized resources.

VSCode Windows 64-bit Download

VSCode Windows 64-bit Download

A free and powerful IDE editor launched by Microsoft