Startseite > Artikel > Backend-Entwicklung > Beispiele zum Lesen und Schreiben von XML-Dateien
Dies ist eine Implementierung zum Lesen und Schreiben von XML-Dateien, die in dem Projekt benötigt wird, an dem ich gerade arbeite. Ich halte sie hier zum späteren Nachschlagen fest – und für alle, die etwas daraus lernen möchten.
Klasse zum Lesen und Schreiben von XML-Dateien:
import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; import org.jdom.Document; import org.jdom.Element; import org.jdom.JDOMException; import org.jdom.input.SAXBuilder; import org.jdom.output.Format; import org.jdom.output.XMLOutputter; import org.lt.cj.config.entities.ConfigModel; import org.lt.cj.config.entities.TMallConfigModel; import org.lt.cj.core.Seed; public class XMLConfigWriter { /*创建淘宝商城的配置文件*/ public Document buildUpMallDocument(TMallConfigModel missionConfig) throws MissionConfigException, EnterUrlsException { if (missionConfig == null) { throw new MissionConfigException(); } else if (missionConfig.getSeeds().isEmpty()) { return null; } // Create the root element Element rootElement = new Element("website"); /* 设置网站属性 */ /* 设置网站名称 */ rootElement.setAttribute("name", missionConfig.getWebsiteName()); /*设置网站地址*/ rootElement.setAttribute("url", missionConfig.getWebsiteUrl()); //添加任务名称 Element taskElement = new Element("taskName"); taskElement.addContent(missionConfig.getTaskName()); rootElement.addContent(taskElement); //构造种子列表节点 Element seeds = new Element("seeds"); for (int i = 0; i < missionConfig.getSeeds().size(); i++) { Element seedElement = new Element("seed"); Element seedNameElement = new Element("seedName"); seedNameElement.addContent(missionConfig.getSeeds().get(i).getSeedName()); Element seedUrlElement = new Element("seedUrl"); seedUrlElement.addContent(missionConfig.getSeeds().get(i).getUrl()); Element seedSortNameElement = new Element("sortName"); seedSortNameElement.addContent(missionConfig.getSeeds().get(i).getSortName()); seedElement.addContent(seedSortNameElement); seedElement.addContent(seedNameElement); seedElement.addContent(seedUrlElement); seeds.addContent(seedElement); } rootElement.addContent(seeds); 
//定义匹配的要采集的URL链接fitUrl的节点 Element fiturls = new Element("fitUrls"); for (int i = 0; i < missionConfig.getFitUrlRegs().size(); i++) { Element fitUrl = new Element("fit_url"); fitUrl.addContent(missionConfig.getFitUrlRegs().get(i)); fiturls.addContent(fitUrl); } rootElement.addContent(fiturls);//添加到根节点 //并发工作线程数 Element workingThreadsElement = new Element("workingThreads"); workingThreadsElement.addContent("" + missionConfig.getWorkingThreads()); rootElement.addContent(workingThreadsElement);//添加到根节点 //定义页面编码节点 Element pageEncodingElement = new Element("pageEncoding"); pageEncodingElement.addContent(missionConfig.getPageEncoding()); rootElement.addContent(pageEncodingElement);//添加到根节点 //定义下载图片控制标志节点 Element dwdPhoFlagElement = new Element("dwdPhoFlag"); dwdPhoFlagElement.addContent(missionConfig.getDwdPhoFlag()); rootElement.addContent(dwdPhoFlagElement); //定义原语言节点 Element oriLan = new Element("orien_lan"); oriLan.addContent(missionConfig.getOrigLanguage()); Element transLan = new Element("trans_lan"); transLan.addContent(missionConfig.getTranLanguage()); rootElement.addContent(oriLan);//添加到根节点 rootElement.addContent(transLan);//添加到根节点 //定义匹配抓取信息的产品页面Url节点 Element pageUrlRegs = new Element("pageUrlRegs"); for (int i = 0; i < missionConfig.getPageReg().size(); i++) { Element pageUrl = new Element("pageUrl"); pageUrl.addContent(missionConfig.getFitUrlRegs().get(i)); pageUrlRegs.addContent(pageUrl); } rootElement.addContent(pageUrlRegs);//添加到根节点 Map<String, List<String>> map = missionConfig.getEntityReg(); List<String> list = null; Element pathElements = new Element("pathElements"); //直接循环算啦 //===================================== Iterator iter = map.entrySet().iterator(); while (iter.hasNext()) { Map.Entry e = (Map.Entry) iter.next(); Element element = new Element(e.getKey() + ""); map = missionConfig.getEntityReg(); list = map.get(e.getKey() + ""); for (int i = 0; i < list.size(); i++) { Element path = new Element("path"); path.addContent(list.get(i)); 
element.addContent(path); } pathElements.addContent(element); } rootElement.addContent(pathElements); /* ===================================================== */ Document myDocument = new Document(rootElement); return myDocument; } /* 创建文档文件 */ public void createConfigFile(Document document, String filepath) { try { /* 定义XML输出器 */ XMLOutputter xmlOutPutter = new XMLOutputter(); xmlOutPutter.setFormat(Format.getPrettyFormat()); File file = new File(filepath); if (!file.exists()) { if (file.createNewFile()) { FileOutputStream fileOutputStream = new FileOutputStream(filepath); xmlOutPutter.output(document, fileOutputStream); return; } } FileOutputStream fileOutputStream = new FileOutputStream(filepath); xmlOutPutter.output(document, fileOutputStream); } catch (java.io.IOException e) { e.printStackTrace(); } } /* 重写文件 */ public void saveTask(String filePath, ConfigModel configModel) { try { TMallConfigModel tMallConfigModel = (TMallConfigModel) configModel; Document document = buildUpMallDocument(tMallConfigModel); if (document != null) { createConfigFile(document, filePath); } } catch (MissionConfigException ex) { Logger.getLogger(XMLConfigWriter.class.getName()).log(Level.SEVERE, null, ex); } catch (EnterUrlsException ex) { Logger.getLogger(XMLConfigWriter.class.getName()).log(Level.SEVERE, null, ex); } } //* xml文件读取方法 */ public TMallConfigModel readMallDocument(String filePath) { TMallConfigModel model = new TMallConfigModel(); SAXBuilder sb = new SAXBuilder(); try { //读取基本配置信息 Document doc = sb.build(filePath); //构造文档对象 Element root = doc.getRootElement(); //获取根元素 String websiteName = root.getAttributeValue("name"); //获取网站名称 String websiteAddr = root.getAttributeValue("url"); //获取网站地址 model.setWebsiteName(websiteName); //设置网站名称 model.setWebsiteUrl(websiteAddr); //设置网站地址 Element taskNameElement = root.getChild("taskName"); //获取任务名内容 String taskName = taskNameElement.getText(); model.setTaskName(taskName); //获取入口种子列表 List<Seed> seedList = new ArrayList(); Element 
seedsElement = root.getChild("seeds"); List list = seedsElement.getChildren(); for (int i = 0; i < list.size(); i++) { Element element = (Element) seedsElement.getChildren().get(i); Seed seed = new Seed(); Element seedNameElement = element.getChild("seedName"); Element seedUrlElement = element.getChild("seedUrl"); Element seedSortNameElement = element.getChild("sortName"); seed.setSeedName(seedNameElement.getTextTrim()); seed.setUrl(seedUrlElement.getTextTrim()); seed.setSortName(seedSortNameElement.getTextTrim()); Element parentSeedElement = element.getChild("parentSeed"); if (parentSeedElement != null) { Seed parentSeed = new Seed(); Element parentSeedNameElement = parentSeedElement.getChild("seedName"); Element parentSeedUrlElement = parentSeedElement.getChild("seedUrl"); Element parentSeedSortNameElement = parentSeedElement.getChild("sortName"); parentSeed.setSeedName(parentSeedNameElement.getText()); parentSeed.setUrl(parentSeedUrlElement.getTextTrim()); parentSeed.setSortName(parentSeedSortNameElement.getTextTrim()); } seedList.add(seed); } model.setSeeds(seedList); //获取匹配的要抽取的页面的特定部分内容 list = new ArrayList(); Element extractHtmlElement = root.getChild("extractHtml"); if (extractHtmlElement != null) { for (int i = 0; i < extractHtmlElement.getChildren().size(); i++) { Element element = (Element) extractHtmlElement.getChildren().get(i); list.add(element.getText()); } } model.setExtractHtmlReg(list); //获取匹配URLs list = new ArrayList(); Element fitUrlsElement = root.getChild("fitUrls"); for (int i = 0; i < fitUrlsElement.getChildren().size(); i++) { Element element = (Element) fitUrlsElement.getChildren().get(i); list.add(element.getText()); } model.setFitUrlRegs(list); //获取线程数量 Element workingThreadsElement = root.getChild("workingThreads"); String workingCount = workingThreadsElement.getText(); model.setWorkingThreads(Integer.valueOf(workingCount)); //获取解析编码 Element pageEncodingElement = root.getChild("pageEncoding"); String pageEncoding = 
pageEncodingElement.getText(); model.setPageEncoding(pageEncoding); //获取是否下载图片的标志 Element dwdPhoFlagElement = root.getChild("dwdPhoFlag"); String dphoFlag = dwdPhoFlagElement.getText(); model.setDwdPhoFlag(dphoFlag); //获取语言 Element orien_lanElement = root.getChild("orien_lan"); String orien = orien_lanElement.getText(); model.setOrigLanguage(orien); Element trans_lanElement = root.getChild("trans_lan"); String trans_lan = trans_lanElement.getText(); model.setTranLanguage(trans_lan); //获取URL正则匹配 Element pageUrlRegsElement = root.getChild("pageUrlRegs"); list = new ArrayList(); for (int i = 0; i < pageUrlRegsElement.getChildren().size(); i++) { Element element = (Element) pageUrlRegsElement.getChildren().get(i); list.add(element.getText()); } model.setPageReg(list); //获取余下的匹配规则 Map<String, List<String>> entityReg = new HashMap(); Element pathElements = root.getChild("pathElements"); for (int i = 0; i < pathElements.getChildren().size(); i++) { Element element = (Element) pathElements.getChildren().get(i); List<String> pathList = new ArrayList(); String mapName = element.getName(); for (int j = 0; j < element.getChildren().size(); j++) { Element childElement = (Element) element.getChildren().get(j); pathList.add(childElement.getText()); } entityReg.put(mapName, pathList); } model.setEntityReg(entityReg); } catch (JDOMException ex) { Logger.getLogger(XMLConfigWriter.class.getName()).log(Level.SEVERE, null, ex); } catch (IOException ex) { Logger.getLogger(XMLConfigWriter.class.getName()).log(Level.SEVERE, null, ex); } return model; } }
Inhalt der XML-Datei:
<?xml version="1.0" encoding="UTF-8"?> <website name="taobao_mall" url="http://www.tmall.com/?ver=2011b"> <taskName>caiji_tmall_精品男装_T恤</taskName> <seeds> <seed> <sortName>精品男装/T恤</sortName> <seedName>精品男装/T恤</seedName> <seedUrl>http://item.tmall.com/item.htm?id=9351702393</seedUrl> </seed> </seeds> <extractHtml> <path>div class="list item-view item-miniView"</path> </extractHtml> <fitUrls> <fit_url>http://www\.tmall\.com/go/act/tmall/iwanttobuy\.php.*</fit_url> <fit_url>http://list\.tmall\.com/.*</fit_url> <fit_url>http://item\.tmall\.com/item\.htm.*</fit_url> </fitUrls> <workingThreads>1</workingThreads> <pageEncoding>UTF-8</pageEncoding> <orien_lan>zh</orien_lan> <trans_lan>en</trans_lan> <pageUrlRegs> <pageUrl>http://www\.tmall\.com/go/act/tmall/iwanttobuy\.php.*</pageUrl> </pageUrlRegs> <pathElements> <commnents> <path>div class="tb-box tshop-psm tshop-psm-bdetailtabl" id="J_Detail"</path> <path>div id="reviews" class="J_DetailSection" data-reviewApi</path> </commnents> <shopAddr> <path>div class="clearfix tb-header-nav"</path> <path>div class="nav"</path> <path>a href</path> </shopAddr> <productDetail> <path>div id="attributes" class="attributes</path> <path>ul class="attributes-list</path> <path>li</path> </productDetail> <photosPath> <path>div class="tb-detail-bd tb-clear"</path> <path>div class="tb-gallery"</path> <path>div class="tb-booth tb-pic tb-s310"</path> <path>img id="J_ImgBooth" src</path> </photosPath> <category> <path>ul class="mallCrumbs-nav" id="J_crumbs"</path> <path>li class="mallCrumbs-nav-item"</path> </category> <countSold> <path>div class="tb-detail-bd tb-clear"</path> <path>ul class="tb-meta"</path> <path>li class="tb-sold-out tb-clear"</path> </countSold> <shopInfo> <path>div class="shop-intro"</path> <path>div class="extend"</path> <path>li</path> </shopInfo> <despPhos> <path>script</path> </despPhos> <thumbPhosPath> <path>div class="tb-detail-bd tb-clear"</path> <path>div class="tb-gallery"</path> <path>ul id="J_UlThumb" 
class="tb-thumb tb-clearfix"</path> <path>img src=</path> </thumbPhosPath> <productName> <path>div class="layout grid-s5m0 "</path> <path>div class="tb-detail-hd"</path> <path>a target="_blank" href=</path> </productName> <productPrice> <path>div class="tb-detail-bd tb-clear"</path> <path>ul class="tb-meta"</path> <path>li id="J_StrPriceModBox" class="tb-detail-price tb-clearfix"</path> </productPrice> </pathElements> </website>