package com.http.client;

import java.io.IOException;

import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;

/**
 * Fetches the raw HTML of a page with Apache HttpClient, routing the request
 * through a proxy server (the proxy hides the crawler's own IP so it is less
 * likely to be banned by the target site).
 *
 * @author oo
 * @date 2018-04-04
 */
public class MyHttpClient {

    private static final Logger logger = Logger.getLogger(MyHttpClient.class);

    /**
     * Entry point: crawls http://www.swordsign.com/ and prints the page source.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the HttpClient instance.
        HttpClient hclient = new DefaultHttpClient();
        // Connect timeout, socket (read) timeout, and the proxy host.
        hclient.getParams()
                .setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000)
                .setParameter(CoreConnectionPNames.SO_TIMEOUT, 20000)
                .setParameter(ConnRouteParams.DEFAULT_PROXY, new HttpHost("111.155.116.237", 8123));
        HttpGet hGet = new HttpGet("http://www.swordsign.com/");
        String content = "";
        try {
            // Execute the GET request against the site.
            HttpResponse execute = hclient.execute(hGet);
            // EntityUtils converts the response entity to a UTF-8 string.
            content = EntityUtils.toString(execute.getEntity(), "utf-8");
        } catch (ClientProtocolException e) {
            // Pass the exception as the cause so the full stack trace is logged
            // (replaces printStackTrace() + message concatenation).
            logger.error("ClientProtocolException while fetching page", e);
        } catch (IOException e) {
            logger.error("IOException while fetching page", e);
        } finally {
            // Release the connections held by the client (was leaked before).
            hclient.getConnectionManager().shutdown();
        }
        System.out.println(content);
    }
}
使用Jsoup进行请求:
package com.http.client;

import java.io.IOException;

import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Fetches a page with Jsoup and prints the combined text of all anchor tags,
 * then each anchor's text together with its target URL.
 */
public class MyJsoup {

    private static final Logger logger = Logger.getLogger(MyJsoup.class);

    /**
     * Entry point: requests http://www.swordsign.com/ via Jsoup and dumps the links.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // Jsoup issues the GET request and parses the response into a DOM.
            Document document = Jsoup.connect("http://www.swordsign.com/").get();
            // All <a> elements in the page.
            Elements elements = document.getElementsByTag("a");
            // Combined text of every link.
            String val = elements.text();
            System.out.println(val);
            // Each link's visible text and its href attribute.
            for (Element element : elements) {
                System.out.println(element.text() + ":" + element.attr("href"));
            }
        } catch (IOException e) {
            // Pass the exception as the cause so the full stack trace is logged
            // (replaces printStackTrace() + message concatenation).
            logger.error("IOException: connection failed", e);
        }
    }
}
HttpClient 结合Jsoup:
package com.http.client;

import java.io.IOException;

import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Combines Apache HttpClient (to download the page) with Jsoup (to parse it):
 * fetches a page and prints the text of every matching list item.
 */
public class HttpCLientAndJsoup {

    /**
     * Entry point: downloads http://www.swordsign.com/ and prints the text of
     * each {@code <li>} inside {@code <div class="salary_con">}.
     *
     * @param args unused
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException on a connection or read failure
     */
    public static void main(String[] args) throws ClientProtocolException, IOException {
        // Create the HttpClient instance.
        HttpClient hClient = new DefaultHttpClient();
        // Crawler URLs are mostly GET requests; build the GET for the target page.
        HttpGet hget = new HttpGet("http://www.swordsign.com/");
        try {
            // Fetch the page source.
            HttpResponse response = hClient.execute(hget);
            // EntityUtils converts the response entity to a UTF-8 string.
            String content = EntityUtils.toString(response.getEntity(), "utf-8");
            // Jsoup parses the downloaded HTML into a DOM.
            Document doc = Jsoup.parse(content);
            // CSS selector: <li> items inside <div class="salary_con">.
            Elements elements = doc.select("div.salary_con li");
            for (Element element : elements) {
                System.out.println(element.text());
            }
        } finally {
            // Release the connections held by the client (was leaked before).
            hClient.getConnectionManager().shutdown();
        }
    }
}