Home > Web Front-end > HTML Tutorial > htmlParser usage tutorial

htmlParser usage tutorial

黄舟
黄舟Original
2016-12-22 14:44:37 · 1681 views

Recently I have been studying Lucene's full-text search. In many places I need to parse or analyze HTML content or HTML pages. Lucene's own demo program also provides an HTML parser, but it is not a pure Java solution, so I searched around and found "HTMLParser" on the Internet.

I will post the code for full-text retrieval with Lucene in a few days (for example, retrieving the articles on this site).

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.htmlparser.visitors.TextExtractingVisitor;

import com.jscud.util.LogMan; //A logging class

/**
* Demonstrates the application of Html Parse.
*
* @author scud http://www.jscud.com
*/

public class ParseHtmlTest
{

public static void main(String[] args) throws Exception
{
String aFile = "e:/jscud/temp/test.htm";

String content = readTextFile(aFile, "GBK") ;

test1(content);
System.out.println("================================== ====);

test2(content);
System.out.println("============================== ======");

test3(content);
System.out.println("========================== ==========");

test4(content);
System.out.println("====================== ==============");

test5(aFile);
System.out.println("================== ==================");

//Accessing external resources is relatively slow
test5("http://www.jscud.com");
System. out.println("====================================");

}

/ **
* Read the file to analyze the content.
* filePath can also be a Url.
*
* @param resource file/Url
*/
public static void test5(String resource) throws Exception
{
Parser myParser = new Parser(resource);

//Set encoding
myParser.setEncoding("GBK");

HtmlPage visitor = new HtmlPage(myParser);

myParser.visitAllNodesWith(visitor);

String textInPage = visitor.getTitle();

System.out.println(textInPage);
}

/**
* Processed by page. For a standard Html page, this method is recommended.
*/
public static void test4(String content) throws Exception
{
Parser myParser;
myParser = Parser.createParser(content, "GBK");

HtmlPage visitor = new HtmlPage(myParser);

myParser.visitAllNodesWith(visitor) ;

String textInPage = visitor.getTitle();

System.out.println(textInPage);
}

/**
* Use Visitor mode to parse html pages.
*
* Small advantages: Translated symbols such as <>
* Disadvantages: There are many spaces and the link cannot be extracted
*
*/
public static void test3(String content) throws Exception
{
Parser myParser;
myParser = Parser.createParser(content, "GBK");

TextExtractingVisitor visitor = new TextExtractingVisitor();

myParser.visitAllNodesWith(visitor);

String textInPage = visitor.getExtractedText();

System. out.println(textInPage);
}

/**
* Get the content of normal text and links.
*
* Use filters.
*/
public static void test2(String content) throws ParserException
{
Parser myParser;
NodeList nodeList = null;

myParser = Parser.createParser( content, "GBK");

NodeFilter textFilter = new NodeClassFilter(TextNode.class);
NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);

//Do not process meta for now
//NodeFilter metaFilter = new NodeClassFilter(MetaTag .class);

OrFilter lastFilter = new OrFilter();
lastFilter.setPredicates(new NodeFilter[] { textFilter, linkFilter });

nodeList = myParser.parse(lastFilter);

Node[] nodes = nodeList. toNodeArray();

for (int i = 0; i < nodes.length; i++)
{
Node anode = (Node) nodes[i];

String line = "";
if (anode instanceof TextNode )
{
TextNode textnode = (TextNode) anode;
//line = textnode.toPlainTextString().trim();
line = textnode.getText();
}
else if (anode instanceof LinkTag)
{
LinkTag linknode = (LinkTag) anode;

line = linknode.getLink();
//@todo Filter jsp tags: You can implement this function yourself
//line = StringFunc.replace(line, "<%.*%> ;", "");
}

if (isTrimEmpty(line))
continue;

System.out.println(line);
}
}

/**
* Parse ordinary text nodes.
*
* @param content
* @throws ParserException
*/
public static void test1(String content) throws ParserException
{
Parser myParser;
Node[] nodes = null;

myParser = Parser.createParser(content, null);

nodes = myParser.extractAllNodesThatAre(TextNode.class); //exception could be thrown here

for (int i = 0; i < nodes.length; i++)
{
TextNode textnode = (TextNode) nodes[i];
String line = textnode.toPlainTextString().trim();
if (line.equals(""))
continue;
System.out.println(line);
}

}

/**
* Read a file into a string.
*
* @param sFileName file name
* @param sEncode String
* @return file content
*/
public static String readTextFile(String sFileName, String sEncode)
{
StringBuffer sbStr = new StringBuffer();

try
{
File ff = new File(sFileName);
InputStreamReader read = new InputStreamReader(new FileInputStream(ff),
sEncode);
BufferedReader ins = new BufferedReader(read);

String dataLine = "";
while (null != (dataLine = ins.readLine()))
{
sbStr.append(dataLine);
sbStr.append("rn");
}

ins.close();
}
catch (Exception e)
{
LogMan.error("read Text File Error", e);
}

return sbStr.toString();
}

/**
* Whether the string is empty after removing left and right spaces
* @param astr String
* @return boolean
*/
public static boolean isTrimEmpty(String astr)
{
if ((null == astr) || (astr.length() == 0))
{
return true;
}
if (isBlank(astr.trim()))
{
return true;
}
return false;
}

/**
* Whether the string is empty: null or the length is 0.
* @param astr source string.
* @return boolean
*/
public static boolean isBlank(String astr)
{
if ((null == astr) || (astr.length() == 0))
{
return true;
}
else
{
return false;
}
}

}

 以上就是htmlParser使用教程的内容,更多相关内容请关注PHP中文网(www.php.cn)!


Statement:
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn