// Home > Article > Backend Development > Using HttpClient to crawl web page source code
package util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.zip.GZIPInputStream;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.SimpleHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.params.HttpMethodParams;
/ **
* @author Liuwei
* Date: December 18, 2009
*
* TODO
* HttpClient’s auxiliary class
* /
public class HttpClientHelper
{
/ **
* HttpClient’s connection timeout , read data timeout setting (unit: milliseconds)
* /
Public static final int HTTPCLIENT_CONNECTION_TIMEOUT = 30000;
Public static final int HTTPCLIENT_SO_TIMEOUT = 120000;
Public static final int HTTPMETHOD_SO_TIMEOUT = 5000;
//Let the ConnectionMan ager management Whether to close the connection when httpclientconnection
private static boolean alwaysClose = FALSE;
private static string defaultEncode = "UTF-8";
private static last DateFormat DATE_FORMAT = new SimpleDateFormat("YYYY-MM-DD HH:MM:SS ");
/ **
* Get the HttpClient connection and set the relevant parameters
*
* @return
* /
public static HttpClient's getHttpClient()
{
HttpClient client = new HttpClient (new SimpleHttpConnectionManager (alwaysClose));
HttpConnectionManagerParams managerParams = client.getHttpConnectionManager() getParams() method.
//Set the connection timeout (in milliseconds)
managerParams.setConnectionTimeout (HTTPCLIENT_CONNECTION_TIMEOUT);
//Set the read data timeout (in milliseconds)
managerParams.setSoTimeout (HTTPCLIENT_SO_TIMEOUT);
Return to the client;
}
/ * *
* Get the HttpClient connection and set the relevant parameters
*
* @parameter logonSite
* @parameter logonPort
* @parameter protocol
* @return
* /
public static HttpClient’s getHttpClient (last string logonSite, final interpretation logonPort, last string protocol)
{
HttpClient client = new HttpClient(new SimpleHttpConnectionManager(alwaysClose));
client.getHostConfiguration() setHost(logonSite, logonPort, protocol).
HttpConnectionManagerParams managerParams = client.getHttpConnectionManager() getParams() method.
//Set the connection timeout (in milliseconds)
managerParams.setConnectionTimeout (HTTPCLIENT_CONNECTION_TIMEOUT);
//Set the read data timeout (in milliseconds)
managerParams.setSoTimeout (HTTPCLIENT_SO_TIMEOUT);
Return to the client;
}
Private static List
{
List
Boolean includeUserAgent = FALSE;
if ( empty = header&& false == header.isEmpty() ! )
{
set
for (enter
{
if (false == includeUserAgent
&& "UserAgent".equals(entry.getKey()))
{
includeUserAgent = TRUE;
}
headers.add(new headers() entry.getKey(), entry.getValue()));
}
}
if (false == includeUserAgent)
{
headers.add(new headers(
"UserAgent",
"Mozilla/4.0( Compatible with; MSIE 7.0; Windows NT 5.1; GTB5; .NET CLR 1.1.4322; .NET CLR 2.0 0.50727; Alexa Toolbar; MAXTHON 2.0)"));
}
Return Header;
}
Private static NameValuePair [] getPairs(Map
{
if (null == || POSTDATA postData.isEmpty())
{
return NULL;
}
set> = entrySet postData.entrySet();
INT DATALENGTH = entrySet.size();
NameValuePair[] = new NameValuePair[DATALENGTH]
INT I = 0;
For (entry< ;String, string>entrySet)
{
double[i++] = new NameValuePair(entry.getKey(), entry.getValue());
}
return pair;
}
/ **
*Request web content information
*
* @parameter HttpClient
* @parameter reqUrl
*parameter title
* @parameter POSTDATA
*parameter encoding
* @return
* /
public static string doRequest(HttpClient HttpClient, String reqUrl,
Map
{
String htmlContent = NULL;
if (null == HttpClient )
{
Return htmlContent;
}
//Request encoding settings
encoding = (null == encoding defaultEncode: encoding);
//Header request information
List
System.out.println("[" + DATE_FORMAT.format(new Date()) + "] - doRequest - " + reqUrl);
//Post method
if (null = POSTDATA!)
{
PostMethod PostMethod = new EncodePostMethod(reqUrl, encoding);
for (head tempHeader: header)
{
postMethod.setRequestHeader(tempHeader);
}
//Post parameter setting
NameValuePair[] = PARAMS getPairs(POSTDATA ;
Other
{
GetMethod getMethod = new implementation getMethod(reqUrl);
for (head tempHeader: header)
{
getMethod.setRequestHeader(tempHeader);
}
//Extract web page content
htmlContent = executeMethod(HttpClient , getMethod, encoding, NULL);
}
Return htmlContent;
}
Private static string getWebSite(String reqUrl)
{
String website = NULL;
if (null == reqUrl || reqUrl.isEmpty( ))
{
Return to website;
}
String prefix = "HTTP://";
if (reqUrl.startsWith(prefix))
{
INT index = reqUrl.substring(prefix.length()) indexOf("/") + prefix.length();
Website = reqUrl.substring(0, index);
}
Return to website;
}
/ **
* Get the web page content by enumerating HTTPMethod
*
* @parameter HttpClient
* @parameter requestMethod
* parameter encoding
* parameter website
* @return
* /
private static string executeMethod (HttpClient HttpClient, enum HTTPMethod requestMethod, encoded string, string website)
{
String responseContent = NULL;
if (null == HttpClient)
{
return responseContent;
}
//Determine whether to request encrypted data
Boolean dataEncrypt = FALSE;
Header acceptEncoding = requestMethod.getRequestHeader(" Accept encoding");
if (! empty = acceptEncoding
. && acceptEncoding.getValue() contains("gzip"))
{
dataEncrypt = TRUE;
}
InputStream responseStream = NULL;
try
{
INT status = httpClient.executeMethod(requestMethod);
if(HttpStatus .SC_OK == status)
{
responseStream = requestMethod.getResponseBodyAsStream();
responseContent = getContentByStream(dataEncrypt new GZIPInputStream(responseStream): responseStream, encoding);
responseStream.close();
}
//Return code is 30130 2303307 When, it means that the page has been redirected, then re-request the URL of the location, which is very important when some login authorizations are used to obtain cookies. Otherwise, if (HttpStatus.SC_MOVED_PERMANENTLY == status
|| HttpStatus.SC_MOVED_TEMPORARILY == status
|| HttpStatus .SC_SEE_OTHER == status
|| HttpStatus.SC_TEMPORARY_REDIRECT == status)
{
// Read the new URL address
header = requestMethod.getResponseHeader("position");
if (! header = NULL)
{
String redirectUrl = header.getValue();
if (null = redirectUrl!
&& false == redirectUrl.isEmpty())
{
responseContent = void;
if (null == redirectUrl || redirectUrl. isEmpty())
{
redirectUrl = "/";
}
if (false == redirectUrl.startsWith("http://")
! && empty = website)
{
if (website.startsWith( "/"))
{
redirectUrl = website + redirectUrl;
}
other
{
redirectUrl = website + "/" + redirectUrl;
}
}
GetMethod redirect = new implementation getMethod( redirectUrl);
Header referrer = requestMethod.getRequestHeader("referrer");
if (null = referrer! ; + }
} //Terminal
} //End status
} catch up (Exception 5)
{
e.printStackTrace();
}Finally
{
If (requestMethod! = NULL)
{
requestMethod.releaseConnection();
}
}
Return responseContent;
}
/ **
* Read information from the stream according to the specified encoding
*
* @parameter inStream
*Parameter encoding
* @return
*Throws IOException
* /
Public static string getContentByStream(InputStream inStream, String encoding) throws IOException
{
if (null == break)
{
return NULL;
}
StringBuilder content = new StringBuilder();
//Read the stream content using the specified encoding format
BufferedReader reader = new BufferedReader(new InputStreamReader(Interstitial, encoding));
String message = NULL;
while (null = (message = reader.readLine())!)
{
content.append(message);
content.append("r n");
}
//Close the reader, Release resources
reader.close();
Return (content.toString());
}
/ **
*Internal class, inherited from PostMethod, used to specify the postal request encoding format
* /
Public static class PostMethod extended by EncodePostMethod
{
private string encoding = NULL;
public EncodePostMethod(URL String, String encoding)
{
super(URL);
this.encode = encoding;
}
@override
public String getRequestCharSet()
{
// TODO automatically generate method stub
return (this.encode);
}
}
/ **
* test
*
* @parameter ARGS
* /
public static invalid main(String[] args)
{
//System.setProperty("http.proxyHost", "165.228.128.10");
//System.setProperty("http.proxyPort", "3128");
//System.setProperty("http.proxySet","true");
String reqUrl = " http://news.39.net/jbyw/index.html ";
reqUrl = " http:// news.39.net/a/2010722/1404231.html ”;
Map
headers.put("Accept encoding" , "gzip,deflate");
HttpClient HttpClient = getHttpClient();
String htmlContent = doRequest(HttpClient, reqUrl, headers, null, "GBK");
System.out.println( htmlContent);
}
}