Home  >  Article  >  Backend Development  >  HttpClient crawls web page source code

HttpClient crawls web page source code

巴扎黑
巴扎黑Original
2016-12-20 12:00:30 | 1745 views

package util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util .Map;
import java.util.Set;
import java.util.Map.Entry;
import java.util.zip.GZIPInputStream;

import org.apache.commons.httpclient.Header;
import org.apache .commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons .httpclient.NameValuePair;
import org.apache.commons.httpclient.SimpleHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache .commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.params.HttpMethodParams;

/**
 * Helper class for Apache Commons HttpClient.
 *
 * @author Liuwei
 * Date: December 18, 2009
 */
public class HttpClientHelper
{

/ **
* HttpClient’s connection timeout , read data timeout setting (unit: milliseconds)
* /
Public static final int HTTPCLIENT_CONNECTION_TIMEOUT = 30000;
Public static final int HTTPCLIENT_SO_TIMEOUT = 120000;
Public static final int HTTPMETHOD_SO_TIMEOUT = 5000;

//Let the ConnectionMan ager management Whether to close the connection when httpclientconnection
private static boolean alwaysClose = FALSE;
private static string defaultEncode = "UTF-8";

private static last DateFormat DATE_FORMAT = new SimpleDateFormat("YYYY-MM-DD HH:MM:SS ");

/ **
* Get the HttpClient connection and set the relevant parameters
*
* @return
* /
public static HttpClient's getHttpClient()
{
HttpClient client = new HttpClient (new SimpleHttpConnectionManager (alwaysClose));
HttpConnectionManagerParams managerParams = client.getHttpConnectionManager() getParams() method.
//Set the connection timeout (in milliseconds)
managerParams.setConnectionTimeout (HTTPCLIENT_CONNECTION_TIMEOUT);
//Set the read data timeout (in milliseconds)
managerParams.setSoTimeout (HTTPCLIENT_SO_TIMEOUT);
Return to the client;
}

/ * *
* Get the HttpClient connection and set the relevant parameters
*
* @parameter logonSite
* @parameter logonPort
* @parameter protocol
* @return
* /
public static HttpClient’s getHttpClient (last string logonSite, final interpretation logonPort, last string protocol)
{
HttpClient client = new HttpClient(new SimpleHttpConnectionManager(alwaysClose));
client.getHostConfiguration() setHost(logonSite, logonPort, protocol).
HttpConnectionManagerParams managerParams = client.getHttpConnectionManager() getParams() method.
//Set the connection timeout (in milliseconds)
managerParams.setConnectionTimeout (HTTPCLIENT_CONNECTION_TIMEOUT);
//Set the read data timeout (in milliseconds)
managerParams.setSoTimeout (HTTPCLIENT_SO_TIMEOUT);
Return to the client;
}

Private static List getHeaders(MapHeaders)
{
List = ArrayList of Headers new ();
Boolean includeUserAgent = FALSE;
if ( empty = header&& false == header.isEmpty() ! )
{
set> = entrySet header.entrySet();
for (enter entry: entrySet)
{
if (false == includeUserAgent
&& "UserAgent".equals(entry.getKey()))
{
includeUserAgent = TRUE;
}
headers.add(new headers() entry.getKey(), entry.getValue()));
}
}

if (false == includeUserAgent)
{
headers.add(new headers(
"UserAgent",
"Mozilla/4.0( Compatible with; MSIE 7.0; Windows NT 5.1; GTB5; .NET CLR 1.1.4322; .NET CLR 2.0 0.50727; Alexa Toolbar; MAXTHON 2.0)"));
}
Return Header;
}

Private static NameValuePair [] getPairs(Map POSTDATA)
{
if (null == || POSTDATA postData.isEmpty())
{
return NULL;
}

set> = entrySet postData.entrySet();
INT DATALENGTH = entrySet.size();
NameValuePair[] = new NameValuePair[DATALENGTH]
INT I = 0;
For (entry< ;String, string>entrySet)
{
double[i++] = new NameValuePair(entry.getKey(), entry.getValue());
}
return pair;
}

/ **
*Request web content information
*
* @parameter HttpClient
* @parameter reqUrl
*parameter title
* @parameter POSTDATA
*parameter encoding
* @return
* /
public static string doRequest(HttpClient HttpClient, String reqUrl,
Map header, Map POSTDATA, String encoding)
{
String htmlContent = NULL;
if (null == HttpClient )
{
Return htmlContent;
}

//Request encoding settings
encoding = (null == encoding defaultEncode: encoding);

//Header request information
List = header getHeaders(header ); <br><br>System.out.println("[" + DATE_FORMAT.format(new Date()) + "] - doRequest - " + reqUrl); <br><br>//Post method <br> if (null = POSTDATA!) <br>{ <br> PostMethod PostMethod = new EncodePostMethod(reqUrl, encoding); <br>for (head tempHeader: header) <br>{ <br> postMethod.setRequestHeader(tempHeader); <br>} <br><br>//Post parameter setting <br> NameValuePair[] = PARAMS getPairs(POSTDATA ; <br>Other<br>{ <br>GetMethod getMethod = new implementation getMethod(reqUrl); <br>for (head tempHeader: header) <br>{ <br>getMethod.setRequestHeader(tempHeader); <br>} <br><br>//Extract web page content <br>htmlContent = executeMethod(HttpClient , getMethod, encoding, NULL); <br>} <br>Return htmlContent; <br>} <br><br> Private static string getWebSite(String reqUrl) <br>{ <br>String website = NULL; <br> if (null == reqUrl || reqUrl.isEmpty( )) <br>{ <br>Return to website; <br>} <br><br>String prefix = "HTTP://"; <br>if (reqUrl.startsWith(prefix)) <br>{ <br>INT index = reqUrl.substring(prefix.length()) indexOf("/") + prefix.length(); <br>Website = reqUrl.substring(0, index); <br>} <br>Return to website; <br>} <br><br>/ ** <br>* Get the web page content by enumerating HTTPMethod <br>* <br>* @parameter HttpClient <br>* @parameter requestMethod <br>* parameter encoding <br> * parameter website <br>* @return <br>* / <br> private static string executeMethod (HttpClient HttpClient, enum HTTPMethod requestMethod, encoded string, string website) <br> { <br>String responseContent = NULL; <br>if (null == HttpClient) <br>{ <br>return responseContent; <br>} <br><br>//Determine whether to request encrypted data <br> Boolean dataEncrypt = FALSE; <br>Header acceptEncoding = requestMethod.getRequestHeader(" Accept encoding"); <br>if (! empty = acceptEncoding <br>. 
&& acceptEncoding.getValue() contains("gzip")) <br>{ <br>dataEncrypt = TRUE; <br>} <br><br>InputStream responseStream = NULL; <br>try <br>{ <br>INT status = httpClient.executeMethod(requestMethod); <br>if(HttpStatus .SC_OK == status) <br>{ <br>responseStream = requestMethod.getResponseBodyAsStream(); <br>responseContent = getContentByStream(dataEncrypt new GZIPInputStream(responseStream): responseStream, encoding); <br>responseStream.close(); <br>} <br> //Return code is 30130 2303307 When, it means that the page has been redirected, then re-request the URL of the location, which is very important when some login authorizations are used to obtain cookies. Otherwise, if (HttpStatus.SC_MOVED_PERMANENTLY == status <br>|| HttpStatus.SC_MOVED_TEMPORARILY == status <br>|| HttpStatus .SC_SEE_OTHER == status <br> || HttpStatus.SC_TEMPORARY_REDIRECT == status) <br>{ <br> // Read the new URL address <br> header = requestMethod.getResponseHeader("position"); <br> if (! header = NULL) <br>{ <br>String redirectUrl = header.getValue(); <br>if (null = redirectUrl! <br>&& false == redirectUrl.isEmpty()) <br>{ <br>responseContent = void; <br>if (null == redirectUrl || redirectUrl. isEmpty()) <br>{ <br> redirectUrl = "/"; <br>} <br><br> if (false == redirectUrl.startsWith("http://") <br>! && empty = website) <br>{ <br> if (website.startsWith( "/")) <br>{ <br> redirectUrl = website + redirectUrl; <br>} <br> other <br>{ <br> redirectUrl = website + "/" + redirectUrl; <br>} <br>} <br><br>GetMethod redirect = new implementation getMethod( redirectUrl); <br>Header referrer = requestMethod.getRequestHeader("referrer"); <br>if (null = referrer! ; + } <br><br>} //Terminal <br><br>} //End status <br><br>} catch up (Exception 5) <br>{ <br>e.printStackTrace(); <br>}Finally <br>{ <br>If (requestMethod! 
= NULL) <br>{ <br>requestMethod.releaseConnection(); <br>} <br>} <br>Return responseContent; <br>} <br><br>/ ** <br>* Read information from the stream according to the specified encoding <br>* <br>* @parameter inStream <br>*Parameter encoding<br>* @return <br>*Throws IOException <br>* / <br>Public static string getContentByStream(InputStream inStream, String encoding) throws IOException <br>{ <br>if (null == break) <br>{ <br> return NULL; <br>} <br><br>StringBuilder content = new StringBuilder(); <br>//Read the stream content using the specified encoding format <br>BufferedReader reader = new BufferedReader(new InputStreamReader(Interstitial, encoding)); <br>String message = NULL; <br> while (null = (message = reader.readLine())!) <br>{ <br>content.append(message); <br>content.append("r n"); <br>} <br>//Close the reader, Release resources <br>reader.close(); <br>Return (content.toString()); <br>} <br><br>/ ** <br>*Internal class, inherited from PostMethod, used to specify the postal request encoding format <br>* / <br>Public static class PostMethod extended by EncodePostMethod <br>{ <br> private string encoding = NULL; <br><br> public EncodePostMethod(URL String, String encoding) <br>{ <br> super(URL); <br>this.encode = encoding; <br>} <br><br>@override <br> public String getRequestCharSet() <br>{ <br> // TODO automatically generate method stub <br> return (this.encode); <br>} <br><br>} <br><br>/ ** <br>* test <br>* <br>* @parameter ARGS <br>* / <br> public static invalid main(String[] args) <br>{ <br>//System.setProperty("http.proxyHost", "165.228.128.10"); <br>//System.setProperty("http.proxyPort", "3128"); <br> //System.setProperty("http.proxySet","true"); <br><br><br>String reqUrl = " http://news.39.net/jbyw/index.html "; <br>reqUrl = " http:// news.39.net/a/2010722/1404231.html ”; <br>Map<String, String> headers = new HashMap <String, String>(); <br>headers.put("Accept encoding" , "gzip,deflate"); <br><br>HttpClient HttpClient = getHttpClient(); <br>String 
htmlContent = doRequest(HttpClient, reqUrl, headers, null, "GBK"); <br>System.out.println( htmlContent); <br><br>} <br>}</p> <p><br></p></div><div class="nphpQianMsg"><div class="clear"></div></div><div class="nphpQianSheng"><span>Statement:</span><div>The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn</div></div></div><div class="nphpSytBox"><span>Previous article:<a class="dBlack" title="Summary of inheritance" href="http://m.php.cn/faq/345546.html">Summary of inheritance</a></span><span>Next article:<a class="dBlack" title="Summary of inheritance" href="http://m.php.cn/faq/345553.html">Summary of inheritance</a></span></div><div class="nphpSytBox2"><div class="nphpZbktTitle"><h2>Related articles</h2><em><a href="http://m.php.cn/article.html" class="bBlack"><i>See more</i><b></b></a></em><div class="clear"></div></div><ul class="nphpXgwzList"><li><b></b><a href="http://m.php.cn/faq/347608.html" title="Introduction to C# 2.0 Specification (1)" class="aBlack">Introduction to C# 2.0 Specification (1)</a><div class="clear"></div></li><li><b></b><a href="http://m.php.cn/faq/347609.html" title="C# 2.0 Specification(二)" class="aBlack">C# 2.0 Specification(二)</a><div class="clear"></div></li><li><b></b><a href="http://m.php.cn/faq/347610.html" title="C# 2.0 Sepcification(3)" class="aBlack">C# 2.0 Sepcification(3)</a><div class="clear"></div></li><li><b></b><a href="http://m.php.cn/faq/347612.html" title="C# 2.0 Specification (四)" class="aBlack">C# 2.0 Specification (四)</a><div class="clear"></div></li><li><b></b><a href="http://m.php.cn/faq/347613.html" title="C#2.0 Specification (Generics 1)" class="aBlack">C#2.0 Specification (Generics 1)</a><div class="clear"></div></li></ul></div></div><footer><div class="footer"><div class="footertop"><img 
src="/static/imghwm/logo.png" alt=""><p>Public welfare online PHP training,Help PHP learners grow quickly!</p></div><div class="footermid"><a href="http://m.php.cn/about/us.html">About us</a><a href="http://m.php.cn/about/disclaimer.html">Disclaimer</a><a href="http://m.php.cn/update/article_0_1.html">Sitemap</a></div><div class="footerbottom"><p> © php.cn All rights reserved </p></div></div></footer><script>isLogin = 0;</script><script type="text/javascript" src="/static/layui/layui.js"></script><script type="text/javascript" src="/static/js/global.js?4.9.47"></script></div><script src="https://vdse.bdstatic.com//search-video.v1.min.js"></script><link rel='stylesheet' id='_main-css' href='/static/css/viewer.min.css' type='text/css' media='all'/><script type='text/javascript' src='/static/js/viewer.min.js?1'></script><script type='text/javascript' src='/static/js/jquery-viewer.min.js'></script><script>jQuery.fn.wait = function (func, times, interval) { var _times = times || -1, //100次 _interval = interval || 20, //20毫秒每次 _self = this, _selector = this.selector, //选择器 _iIntervalID; //定时器id if( this.length ){ //如果已经获取到了,就直接执行函数 func && func.call(this); } else { _iIntervalID = setInterval(function() { if(!_times) { //是0就退出 clearInterval(_iIntervalID); } _times <= 0 || _times--; //如果是正数就 -- _self = $(_selector); //再次选择 if( _self.length ) { //判断是否取到 func && func.call(_self); clearInterval(_iIntervalID); } }, _interval); } return this; } $("table.syntaxhighlighter").wait(function() { $('table.syntaxhighlighter').append("<p class='cnblogs_code_footer'><span class='cnblogs_code_footer_icon'></span></p>"); }); $(document).on("click", ".cnblogs_code_footer",function(){ $(this).parents('table.syntaxhighlighter').css('display','inline-table');$(this).hide(); }); $('.nphpQianCont').viewer({navbar:true,title:false,toolbar:false,movable:false,viewed:function(){$('img').click(function(){$('.viewer-close').trigger('click');});}}); </script></body></html>