Startseite > Backend-Entwicklung > C#.Net-Tutorial > HttpClient crawlt den Quellcode von Webseiten

HttpClient crawlt den Quellcode von Webseiten

巴扎黑
巴扎黑Original
2016-12-20 12:00:30 · 1791 Aufrufe

package util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.zip.GZIPInputStream;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.SimpleHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.params.HttpMethodParams;

/**
 * @author Liuwei
 * Date: December 18, 2009
 *
 * TODO
 * Helper class for HttpClient
 */
öffentliche Klasse HttpClientHelper
{

/ **
* Zeitüberschreitung der HttpClient-Verbindung, Zeitüberschreitungseinstellung für das Lesen von Daten (Einheit: Millisekunden)
* /
Öffentliche statische endgültige Interpretation HTTPCLIENT_CONNECTION_TIMEOUT = 30000;
Public static final int HTTPCLIENT_SO_TIMEOUT = 120000;
Public static final int HTTPMETHOD_SO_TIMEOUT = 5000;

//Lassen Sie den ConnectionManager verwalten, ob die Verbindung geschlossen werden soll
Private static boolean AlwaysClose = FALSE ;
private static string defaultEncode = "UTF-8"; private static last DateFormat DATE_FORMAT = new SimpleDateFormat("YYYY-MM-DD HH:MM:SS");

/ **
* Holen Sie sich die HttpClient-Verbindung und legen Sie die relevanten Parameter fest
*
* @return
* /
Public static HttpClient's getHttpClient()
{
HttpClient client = new HttpClient(new SimpleHttpConnectionManager(alwaysClose));
HttpConnectionManagerParams managerParams = client.getHttpConnectionManager() getParams() Methode.
//Legen Sie das Zeitlimit für die Verbindung fest (in Millisekunden)
managerParams.setConnectionTimeout (HTTPCLIENT_CONNECTION_TIMEOUT);
//Legen Sie das Zeitlimit für das Lesen von Daten fest (in Millisekunden)
managerParams.setSoTimeout (HTTPCLIENT_SO_TIMEOUT);
Zurück zum Client;
}

/ **
* Holen Sie sich die HttpClient-Verbindung und legen Sie die relevanten Parameter fest
*
* @parameter logonSite
* @parameter logonPort
* @parameter Protocol
* @return
* /
public static HttpClient getHttpClient(last string logonSite, last int logonPort, last string Protocol)
{
HttpClient client = new HttpClient( new SimpleHttpConnectionManager(alwaysClose));
client.getHostConfiguration() setHost(logonSite, logonPort, Protocol).
HttpConnectionManagerParams managerParams = client.getHttpConnectionManager() getParams()-Methode.
//Legen Sie das Zeitlimit für die Verbindung fest (in Millisekunden)
managerParams.setConnectionTimeout (HTTPCLIENT_CONNECTION_TIMEOUT);
//Legen Sie das Zeitlimit für das Lesen von Daten fest (in Millisekunden)
managerParams.setSoTimeout (HTTPCLIENT_SO_TIMEOUT);
return client; ;
boolean includeUserAgent = FALSE;
if (null = header&& false == header.isEmpty()!)
{
set> );
for (enter Eintrag: EntrySet)
{
if (false == includeUserAgent
&&"UserAgent".equals(entry.getKey()))
{
includeUserAgent = TRUE;
}
headers.add(new headers(entry.getKey(), enter.getValue())); >if (false == includeUserAgent)
{
headers.add(new headers(
"UserAgent" ,
"Mozilla/4.0 (Kompatibel mit; MSIE 7.0; Windows NT 5.1; GTB5; . NET CLR 1.1.4322; .NET CLR 2.0 0.50727; MAXTHON 2.0)); ;String, String> POSTDATA)
{
if (null == || POSTDATA postData.isEmpty())
{
return NULL; ;input> = EntrySet postData.entrySet(); 🎜>For (enter enter :entrySet)
{
double[i++] = new NameValuePair(entry.getKey(), enter.getValue()}
Rückgabepaar;
}

/**
 * Requests the content of a web page.
 *
 * @param httpClient
 * @param reqUrl
 * @param header
 * @param postData
 * @param encode
 * @return
 */
Öffentliche statische Zeichenfolge doRequest(HttpClient HttpClient, String reqUrl,
Map header, Map< String, String> POSTDATA, String-Kodierung)
{
String htmlContent = NULL;if (null == HttpClient)
{
return htmlContent; >//Codierungseinstellungen anfordern
Encoding = (empty == binding defaultEncode: binding);//Header-Anforderungsinformationen
List = header getHeaders (head); 🎜>System.out.println("[" + DATE_FORMAT.format(new Date()) + "] - doRequest - " + reqUrl>//Exchange method <br> if (empty = POSTDATA! ) <br>{ <br> PostMethod PostMethod = new EncodePostMethod(reqUrl,kodierung); <br> for (head tempHeader: header) <br>{ <br>postMethod.setRequestHeader(tempHeader); 🎜>//Post-Parameter-Einstellung NameValuePair [] of <br> = PARAMS getPairs(POSTDATA);if (leer = Parameter!) <br>{ <br>postMethod.setRequestBody(PARAMS); <br>} <br><br>//Webseiteninhalt extrahieren <br>htmlContent =executeMethod(HttpClient, post-method,kodierung, getWebSite(reqUrl)); neue Implementierung getMethod(reqUrl); <br>for (head tempHeader: header) <br>{ <br>getMethod.setRequestHeader(tempHeader); <br>} <br><br>//Webseiteninhalt extrahieren <br> htmlContent =executeMethod(HttpClient, getMethod,kodierung, NULL); <br>} <br>return htmlContent; <br>} <br><br>Private static String getWebSite(String reqUrl) <br>{ <br>String website = NULL; <br>if (null == reqUrl || reqUrl.isEmpty()) <br>{ <br>Return website; <br>if (reqUrl.startsWith(prefix)) <br>{ <br>INT index = reqUrl.substring(prefix.length()) indexOf("/") + prefix .length(); <br>Website = reqUrl.substring(0, index); <br>} <br>Return to website; >* @parameter HttpClient <br>* @parameter requestMethod <br>*parameterkodierung <br>*parameter website <br>* @return <br>* / <br>Privater statischer StringexecuteMethod(HttpClient HttpClient, enum HTTPMethod requestMethod, codierter String, String-Site) <br>{ <br>String ResponseContent = NULL; <br>if (null == HttpClient) <br>{ <br>Return ResponseContent; Bestimmen Sie, ob verschlüsselte Daten angefordert werden sollen <br> Boolean dataEncrypt = FALSE; <br>Header takeEncoding = requestMethod.getRequestHeader ("Accept Encoding"); leer = AcceptEncoding <br>. 
&& AcceptEncoding.getValue() enthält("gzip")) <br>{ <br>dataEncrypt = TRUE <br>} <br><br>InputStream ResponseStream = NULL <br>try<br>{ <br> INT status = httpClient.executeMethod(requestMethod); <br>if (HttpStatus.SC_OK == status) <br>{ <br>responseStream = requestMethod.getResponseBodyAsStream(); <br>responseContent = getContentByStream(dataEncrypt new GZIPInputStream( ResponseStream) : ResponseStream, Encoding); . Dies ist bei einigen Anmeldungen nicht der Fall. Wichtig bei der Autorisierung der Verwendung von Cookies <br> Andernfalls gilt, wenn (HttpStatus.SC_MOVED_PERMANENTLY == Status <br>|| = Status <br>||. HttpStatus.SC_TEMPORARY_REDIRECT = = Status) <br>{ <br>//Neue URL-Adresse lesen <br>Header = requestMethod.getResponseHeader("position"); <br>if (! header = NULL ) <br> if (empty == RedirectUrl || RedirectUrl.isEmpty()) <br>{ <br>redirectUrl = "/"; <br>} <br><br> if (false == RedirectUrl.startsWith(" HTTP: //") <br>! && empty = website) <br>{ <br>if (website.startsWith("/")) <br>{ <br>redirectUrl = website+redirectUrl; <br>} <br>Andere<br>{ <br>redirectUrl = website + "/" + restartUrl; <br>} <br><br>GetMethod weitergeleitet = neue Implementierung getMethod(redirectUrl); = requestMethod.getRequestHeader("referrer"); <br>if (null = referrer! ) <br>{ <br>redirect.addRequestHeader(referrer); <br>} <br> header cookie = requestMethod.getRequestHeader("cookie"); <br>if (null = cookie!) <br>{ <br>redirect.addRequestHeader(cookie); <br>status = httpClient.executeMethod(redirect); <br>if (HttpStatus.SC_OK == status) <br>{ <br>responseStream = getResponseBodyAsStream(). 
; <br>responseContent = getContentByStream(responseStream(); } //Endstatus <br><br>} Catch Up (Exception 5) <br>{ <br>e.printStackTrace (); <br>}Endlich <br>{ <br>if (requestMethod!= NULL) <br>{ <br>requestMethod.releaseConnection(); <br>} <br>} <br>return ResponseContent; <br><br>/ ** <br> *Informationen aus dem Stream gemäß der angegebenen Kodierung lesen <br>* <br>* @Parameter inStream <br>*Parameterkodierung <br>* @Return <br>* IOException auslösen <br>* / <br>Public static String getContentByStream(InputStream inStream, String-Kodierung) löst eine IOException aus <br>{ <br>if (null == instream) <br>{ <br>return NULL <br>} <br><br>Content of StringBuilder = new StringBuilder(); <br>//Lesen Sie den Stream-Inhalt mit dem angegebenen Codierungsformat <br> BufferedReader reader = new BufferedReader(new InputStreamReader(Interstitial, binding)); <br> String message = NULL; while (null = (message = reader.readLine())!) <br>{ <br>content.append(message); <br>content.append("r n"} <br>//Schließen Sie den Reader und Ressourcen freigeben <br>reader.close(); <br>Return (content.toString()); <br><br>/ ** <br>* Innere Klasse, geerbt von PostMethod, früher Geben Sie das Codierungsformat der Postanforderung an >{ <br>Super(URL); <br>this.encode = binding; generate method stub <br> return (this.encode); <br>} <br><br>} <br><br>/ * * <br>*Test<br>* <br>* @parameter ARGS <br>* / <br>public static void main(String[] args)<br>{ <br>//System.setProperty( "http.proxyHost", "165.228.128.10"); <br>//System. 
setProperty("http.proxyPort", "3128"); <br>//System.setProperty("http.proxySet", "true "); <br><br><br>String reqUrl = " http:// news.39.net/jbyw/index.html "; <br>reqUrl = " http://news.39.net/ a/2010722/1404231.html "; <br>Map<String, String> headers = neu HashMap <String, String>(); <br>headers.put("Akzeptiere die Codierung", "gzip,deflate"); HttpClient, reqUrl, headers, null, "GBK"); <br> System.out.println(htmlContent); <br><br>} <br></p></div><div class="nphpQianMsg"><div class="clear"></div></div><div class="nphpQianSheng"><span>Stellungnahme:</span><div>Der Inhalt dieses Artikels wird freiwillig von Internetnutzern beigesteuert und das Urheberrecht liegt beim ursprünglichen Autor. Diese Website übernimmt keine entsprechende rechtliche Verantwortung. Wenn Sie Inhalte finden, bei denen der Verdacht eines Plagiats oder einer Rechtsverletzung besteht, wenden Sie sich bitte an admin@php.cn</div></div></div><div class="nphpSytBox"><span>Vorheriger Artikel:<a class="dBlack" title="Zusammenfassung der Vererbung" href="https://m.php.cn/de/faq/345546.html">Zusammenfassung der Vererbung</a></span><span>Nächster Artikel:<a class="dBlack" title="Zusammenfassung der Vererbung" href="https://m.php.cn/de/faq/345553.html">Zusammenfassung der Vererbung</a></span></div><div class="nphpSytBox2"><div class="nphpZbktTitle"><h2>In Verbindung stehende Artikel</h2><em><a href="https://m.php.cn/de/article.html" class="bBlack"><i>Mehr sehen</i><b></b></a></em><div class="clear"></div></div><ins class="adsbygoogle" style="display:block" data-ad-format="fluid" data-ad-layout-key="-6t+ed+2i-1n-4w" data-ad-client="ca-pub-5902227090019525" data-ad-slot="8966999616"></ins><script> (adsbygoogle = window.adsbygoogle || []).push({}); </script><ul class="nphpXgwzList"><li><b></b><a href="https://m.php.cn/de/faq/339563.html" title=".Net Core-Grafikverifizierungscode" class="aBlack">.Net Core-Grafikverifizierungscode</a><div class="clear"></div></li><li><b></b><a 
href="https://m.php.cn/de/faq/339607.html" title="Laden der .NET Core-Konfigurationsdatei und DI-Injektion von Konfigurationsdaten" class="aBlack">Laden der .NET Core-Konfigurationsdatei und DI-Injektion von Konfigurationsdaten</a><div class="clear"></div></li><li><b></b><a href="https://m.php.cn/de/faq/339782.html" title="Dokumentation zum .NET Core CLI-Tool dotnet-publish" class="aBlack">Dokumentation zum .NET Core CLI-Tool dotnet-publish</a><div class="clear"></div></li><li><b></b><a href="https://m.php.cn/de/faq/345608.html" title="asp.net verwendet .net-Steuerelemente, um Dropdown-Navigationsmenüs zu erstellen" class="aBlack">asp.net verwendet .net-Steuerelemente, um Dropdown-Navigationsmenüs zu erstellen</a><div class="clear"></div></li><li><b></b><a href="https://m.php.cn/de/faq/346494.html" title="So erhalten Sie den Namen des Controllers in Asp.net MVC" class="aBlack">So erhalten Sie den Namen des Controllers in Asp.net MVC</a><div class="clear"></div></li></ul></div></div><ins class="adsbygoogle" style="display:block" data-ad-format="autorelaxed" data-ad-client="ca-pub-5902227090019525" data-ad-slot="5027754603"></ins><script> (adsbygoogle = window.adsbygoogle || []).push({}); </script><footer><div class="footer"><div class="footertop"><img src="/static/imghwm/logo.png" alt=""><p>Online-PHP-Schulung für das Gemeinwohl,Helfen Sie PHP-Lernenden, sich schnell weiterzuentwickeln!</p></div><div class="footermid"><a href="https://m.php.cn/de/about/us.html">Über uns</a><a href="https://m.php.cn/de/about/disclaimer.html">Haftungsausschluss</a><a href="https://m.php.cn/de/update/article_0_1.html">Sitemap</a></div><div class="footerbottom"><p> © php.cn All rights reserved </p></div></div></footer><script>isLogin = 0;</script><script type="text/javascript" src="/static/layui/layui.js"></script><script type="text/javascript" src="/static/js/global.js?4.9.47"></script></div><script src="https://vdse.bdstatic.com//search-video.v1.min.js"></script><link rel='stylesheet' 
id='_main-css' href='/static/css/viewer.min.css' type='text/css' media='all'/><script type='text/javascript' src='/static/js/viewer.min.js?1'></script><script type='text/javascript' src='/static/js/jquery-viewer.min.js'></script><script>jQuery.fn.wait = function (func, times, interval) { var _times = times || -1, //100次 _interval = interval || 20, //20毫秒每次 _self = this, _selector = this.selector, //选择器 _iIntervalID; //定时器id if( this.length ){ //如果已经获取到了,就直接执行函数 func && func.call(this); } else { _iIntervalID = setInterval(function() { if(!_times) { //是0就退出 clearInterval(_iIntervalID); } _times <= 0 || _times--; //如果是正数就 -- _self = $(_selector); //再次选择 if( _self.length ) { //判断是否取到 func && func.call(_self); clearInterval(_iIntervalID); } }, _interval); } return this; } $("table.syntaxhighlighter").wait(function() { $('table.syntaxhighlighter').append("<p class='cnblogs_code_footer'><span class='cnblogs_code_footer_icon'></span></p>"); }); $(document).on("click", ".cnblogs_code_footer",function(){ $(this).parents('table.syntaxhighlighter').css('display','inline-table');$(this).hide(); }); $('.nphpQianCont').viewer({navbar:true,title:false,toolbar:false,movable:false,viewed:function(){$('img').click(function(){$('.viewer-close').trigger('click');});}}); </script></body><!-- Matomo --><script> var _paq = window._paq = window._paq || []; /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ _paq.push(['trackPageView']); _paq.push(['enableLinkTracking']); (function() { var u="https://tongji.php.cn/"; _paq.push(['setTrackerUrl', u+'matomo.php']); _paq.push(['setSiteId', '9']); var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); })(); </script><!-- End Matomo Code --></html>