Home  >  Article  >  Web Front-end  >  Use Jsoup to capture page data

Use Jsoup to capture page data

PHP中文网
PHP中文网Original
2017-04-01 16:39:551450browse

You need to use the jsoup-1.7.3.jar package. If you need to see the document, I will download it. Please take a step to the official website

Here I will post the test of the Java project I used. Code

package com.javen.Jsoup;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class JsoupTest {
    static String url="http://www.cnblogs.com/zyw-205520/archive/2012/12/20/2826402.html";
    /**
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        
        // TODO Auto-generated method stub
        BolgBody();
        //test();
        //Blog();
        /*
         * Document doc = Jsoup.connect("http://www.oschina.net/")
         * .data("query", "Java") // 请求参数 .userAgent("I ’ m jsoup") // 设置
         * User-Agent .cookie("auth", "token") // 设置 cookie .timeout(3000) //
         * 设置连接超时时间 .post();
         */// 使用 POST 方法访问 URL

        /*
         * // 从文件中加载 HTML 文档 File input = new File("D:/test.html"); Document doc
         * = Jsoup.parse(input,"UTF-8","http://www.oschina.net/");
         */
    }

    /**
     * 获取指定HTML 文档指定的body
     * @throws IOException
     */
    private static void BolgBody() throws IOException {
        // 直接从字符串中输入 HTML 文档
        String html = "<html><head><title> 开源中国社区 </title></head>"
                + "<body><p> 这里是 jsoup 项目的相关文章 </p></body></html>";
        Document doc = Jsoup.parse(html);
        System.out.println(doc.body());
        
        
        // 从 URL 直接加载 HTML 文档
        Document doc2 = Jsoup.connect(url).get();
        String title = doc2.body().toString();
        System.out.println(title);
    }

    /**
     * 获取博客上的文章标题和链接
     */
    public static void article() {
        Document doc;
        try {
            doc = Jsoup.connect("http://www.cnblogs.com/zyw-205520/").get();
            Elements ListDiv = doc.getElementsByAttributeValue("class","postTitle");
            for (Element element :ListDiv) {
                Elements links = element.getElementsByTag("a");
                for (Element link : links) {
                    String linkHref = link.attr("href");
                    String linkText = link.text().trim();
                    System.out.println(linkHref);
                    System.out.println(linkText);
                }
            }
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

    }
    /**
     * 获取指定博客文章的内容
     */
    public static void Blog() {
        Document doc;
        try {
            doc = Jsoup.connect("http://www.cnblogs.com/zyw-205520/archive/2012/12/20/2826402.html").get();
            Elements ListDiv = doc.getElementsByAttributeValue("class","postBody");
            for (Element element :ListDiv) {
                System.out.println(element.html());
            }
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        
    }

}

The following is an introduction to the use of Jsoup in android to asynchronously parse web page data. Please note: It is easy to encounter a garbled stable

configuration here. File : Add permissions to AndroidManifest.xml

<uses-permission android:name="android.permission.INTERNET"></uses-permission>

Layout file

<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
    xmlns:tools="http://schemas.android.com/tools"
    android:layout_width="match_parent"
    android:layout_height="match_parent"
    android:orientation="vertical" >

    <WebView
        android:id="@+id/webView"
        android:layout_width="fill_parent"
        android:layout_height="200dp" />

    <ScrollView
        android:layout_width="wrap_content"
        android:layout_height="wrap_content" >

        <TextView
            android:id="@+id/textView"
            android:layout_width="wrap_content"
            android:layout_height="wrap_content"
            android:text="@string/hello_world" />
    </ScrollView>

</LinearLayout>

The code that mainly loads data asynchronously

package com.javen.aaa;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import android.app.Activity;
import android.app.Dialog;
import android.app.ProgressDialog;
import android.os.AsyncTask;
import android.os.Bundle;
import android.util.Log;
import android.webkit.WebView;
import android.widget.TextView;

public class MainActivity extends Activity {
    private WebView webView;
    private TextView textView;
    private static final int DIALOG_KEY = 0;
    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.main);
        webView = (WebView) findViewById(R.id.webView);
        textView=(TextView) findViewById(R.id.textView);
        try {
            ProgressAsyncTask asyncTask=new ProgressAsyncTask(webView,textView);
            asyncTask.execute(10000);
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }
    
    public  String test() {
        StringBuffer buffer=new StringBuffer();
        Document doc;
        try {
            doc = Jsoup.connect("http://www.cnblogs.com/zyw-205520/").get();
            Elements ListDiv = doc.getElementsByAttributeValue("class","postTitle");
            for (Element element :ListDiv) {
                Elements links = element.getElementsByTag("a");
                for (Element link : links) {
                    String linkHref = link.attr("href");
                    String linkText = link.text().trim();
                    buffer.append("linkHref=="+linkHref);
                    buffer.append("linkText=="+linkText);
                    
                    System.out.println(linkHref);
                    System.out.println(linkText);
                }
            }
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        return buffer.toString();

    }

        // 弹出"查看"对话框
        @Override
        protected Dialog onCreateDialog(int id) {
            switch (id) {
            case DIALOG_KEY: {
                ProgressDialog dialog = new ProgressDialog(this);
                dialog.setMessage("获取数据中  请稍候...");
                dialog.setIndeterminate(true);
                dialog.setCancelable(true);
                return dialog;
            }
            }
            return null;
        }
        
        public static String readHtml(String myurl) {
            StringBuffer sb = new StringBuffer("");
            URL url;
            try {
                url = new URL(myurl);
                BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream(), "gbk"));
                String s = "";
                while ((s = br.readLine()) != null) {
                    sb.append(s + "\r\n");
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            return sb.toString();
        }
    
    class ProgressAsyncTask extends AsyncTask<Integer, Integer, String> {

        private WebView webView;
        private TextView textView;
        public ProgressAsyncTask(WebView webView,TextView textView) {
            super();
            this.webView=webView;
            this.textView=textView;
        }

        /**
         * 这里的Integer参数对应AsyncTask中的第一个参数 这里的String返回值对应AsyncTask的第三个参数
         * 该方法并不运行在UI线程当中,主要用于异步操作,所有在该方法中不能对UI当中的空间进行设置和修改
         * 但是可以调用publish Progress方法触发onProgressUpdate对UI进行操作
         */
        @Override
        protected String doInBackground(Integer... params) {
            String str =null;
            Document doc = null;
            try {
//                String url ="http://www.cnblogs.com/zyw-205520/p/3355681.html";
//                
//                doc= Jsoup.parse(new URL(url).openStream(),"utf-8", url);
//                //doc = Jsoup.parse(readHtml(url));
//                //doc=Jsoup.connect(url).get();
//                str=doc.body().toString();
                doc = Jsoup.connect("http://www.cnblogs.com/zyw-205520/archive/2012/12/20/2826402.html").get();
                Elements ListDiv = doc.getElementsByAttributeValue("class","postBody");
                for (Element element :ListDiv) {
                    str=element.html();
                    System.out.println(element.html());
                }
                Log.d("doInBackground", str.toString());
                System.out.println(str);
                //你可以试试GBK或UTF-8
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            return str.toString() ;
            //return test();
        }

        /**
         * 这里的String参数对应AsyncTask中的第三个参数(也就是接收doInBackground的返回值)
         * 在doInBackground方法执行结束之后在运行,并且运行在UI线程当中 可以对UI空间进行设置
         */
        @Override
        protected void onPostExecute(String result) {
            webView.loadData(result, "text/html;charset=utf-8", null);
            textView.setText(result);
            removeDialog(DIALOG_KEY);
        }

        // 该方法运行在UI线程当中,并且运行在UI线程当中 可以对UI空间进行设置
        @Override
        protected void onPreExecute() {
            showDialog(DIALOG_KEY);
        }

        /**
         * 这里的Intege参数对应AsyncTask中的第二个参数
         * 在doInBackground方法当中,,每次调用publishProgress方法都会触发onProgressUpdate执行
         * onProgressUpdate是在UI线程中执行,所有可以对UI空间进行操作
         */
        @Override
        protected void onProgressUpdate(Integer... values) {
            
        }
    }

}

The above is Use Jsoup to capture the data content of the page. For more related content, please pay attention to the PHP Chinese website (www.php.cn)!

Statement:
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn