package com.http.client;

import java.io.IOException;

import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;

/**
 * Fetches the raw HTML of a page with Apache HttpClient, routing the request
 * through a proxy server (the proxy hides the crawler's own IP so it is less
 * likely to be banned by the target site).
 *
 * @author oo
 * @date 2018-04-04
 */
public class MyHttpClient {

    private static final Logger logger = Logger.getLogger(MyHttpClient.class);

    /**
     * Entry point: crawls http://www.swordsign.com/ and prints the page source.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the HttpClient instance.
        HttpClient hclient = new DefaultHttpClient();
        // Connect timeout, socket (read) timeout, and the proxy host.
        hclient.getParams()
                .setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000)
                .setParameter(CoreConnectionPNames.SO_TIMEOUT, 20000)
                .setParameter(ConnRouteParams.DEFAULT_PROXY, new HttpHost("111.155.116.237", 8123));
        HttpGet hGet = new HttpGet("http://www.swordsign.com/");
        String content = "";
        try {
            // Execute the GET request against the site.
            HttpResponse execute = hclient.execute(hGet);
            // EntityUtils converts the response entity to a UTF-8 string.
            content = EntityUtils.toString(execute.getEntity(), "utf-8");
        } catch (ClientProtocolException e) {
            // Pass the exception as the cause so the full stack trace is logged
            // (replaces printStackTrace() + message concatenation).
            logger.error("ClientProtocolException while fetching page", e);
        } catch (IOException e) {
            logger.error("IOException while fetching page", e);
        } finally {
            // Release the connections held by the client (was leaked before).
            hclient.getConnectionManager().shutdown();
        }
        System.out.println(content);
    }
}
使用Jsoup进行请求:
package com.http.client;

import java.io.IOException;

import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Fetches a page with Jsoup and prints the combined text of all anchor tags,
 * then each anchor's text together with its target URL.
 */
public class MyJsoup {

    private static final Logger logger = Logger.getLogger(MyJsoup.class);

    /**
     * Entry point: requests http://www.swordsign.com/ via Jsoup and dumps the links.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // Jsoup issues the GET request and parses the response into a DOM.
            Document document = Jsoup.connect("http://www.swordsign.com/").get();
            // All <a> elements in the page.
            Elements elements = document.getElementsByTag("a");
            // Combined text of every link.
            String val = elements.text();
            System.out.println(val);
            // Each link's visible text and its href attribute.
            for (Element element : elements) {
                System.out.println(element.text() + ":" + element.attr("href"));
            }
        } catch (IOException e) {
            // Pass the exception as the cause so the full stack trace is logged
            // (replaces printStackTrace() + message concatenation).
            logger.error("IOException: connection failed", e);
        }
    }
}
HttpClient 结合Jsoup:
package com.http.client;

import java.io.IOException;

import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Combines Apache HttpClient (to download the page) with Jsoup (to parse it):
 * fetches a page and prints the text of every matching list item.
 */
public class HttpCLientAndJsoup {

    /**
     * Entry point: downloads http://www.swordsign.com/ and prints the text of
     * each {@code <li>} inside {@code <div class="salary_con">}.
     *
     * @param args unused
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException on a connection or read failure
     */
    public static void main(String[] args) throws ClientProtocolException, IOException {
        // Create the HttpClient instance.
        HttpClient hClient = new DefaultHttpClient();
        // Crawler URLs are mostly GET requests; build the GET for the target page.
        HttpGet hget = new HttpGet("http://www.swordsign.com/");
        try {
            // Fetch the page source.
            HttpResponse response = hClient.execute(hget);
            // EntityUtils converts the response entity to a UTF-8 string.
            String content = EntityUtils.toString(response.getEntity(), "utf-8");
            // Jsoup parses the downloaded HTML into a DOM.
            Document doc = Jsoup.parse(content);
            // CSS selector: <li> items inside <div class="salary_con">.
            Elements elements = doc.select("div.salary_con li");
            for (Element element : elements) {
                System.out.println(element.text());
            }
        } finally {
            // Release the connections held by the client (was leaked before).
            hClient.getConnectionManager().shutdown();
        }
    }
}