This article introduces how to convert HTML pages into Word documents and save them with PHP, analyzing the features and usage of the PHPWord tool through an example. It should be a useful reference for anyone who needs it.
The example in this article describes how PHP converts HTML pages into word and saves them. Share it with everyone for your reference, the details are as follows:
A PHP tool called PHPWord is used here.
The principle of generating Word is to compress the specified xml into a zip package and change the suffix name to doc or docx.
So to use PHPWord, you need to install the zip.dll compression extension in your PHP environment. I wrote a demo.
Function description:
20150507 — Obtaining `<p>` tags and `<ul>` list tags in HTML
20150508 — Added the function of getting pictures in articles
20150509 — Added line spacing and filtered wrong pictures
20150514 — Added table processing and changed the code to object-oriented
20150519 — Added GD library to process network images
require_once 'PHPWord.php';
require_once 'SimpleHtmlDom.class.php';

/**
 * Fetches an HTML page, extracts its paragraphs, list items, images and
 * tables with simple_html_dom, and writes them into a Word 2007 (.docx)
 * document with PHPWord. Constructing the object performs the whole
 * pipeline: download, parse, build the document, save it and stream it to
 * the client as a download.
 */
class Word
{
    private $url;

    /**
     * Parsed content. Integer keys hold one entry per leaf node
     * (array with "tag", "text", ...); the "table" key holds table rows.
     */
    private $LinetextArr = array();

    /** scheme://host[:port] of the source page, used to absolutise image URLs. */
    public $host;

    /** Directory the most recent image was (or would be) saved into. */
    public $ImgDir;

    public $CurrentDir;
    public $error = array(); // collected PHP errors (error_get_last() arrays)
    public $filename = null;
    public $Allowtag = "p,ol,ul,table";

    /** Statistics **/
    public $DownImg = 0;
    public $expendTime = 0;
    public $HttpRequestTime = 0;
    public $ContentLen = 0;
    public $HttpRequestArr = array();
    public $expendmemory = 0;

    /**
     * @param string $url Absolute URL of the HTML page to convert.
     */
    public function __construct($url)
    {
        $startTime = $this->_Time();
        $startMemory = $this->_memory();

        $this->url = $url;
        $UrlArr = parse_url($this->url);
        $this->host = $UrlArr["scheme"]."://".$UrlArr['host'];
        if (isset($UrlArr['port'])) {
            // Keep a non-default port so relative image URLs resolve correctly.
            $this->host .= ":".$UrlArr['port'];
        }

        $this->CurrentDir = getcwd();
        $this->LinetextArr["table"] = array();

        $html = new simple_html_dom($this->url);
        $this->HttpRequestArr[] = $this->url;
        $this->HttpRequestTime++;

        foreach ($html->find($this->Allowtag) as $value) {
            if ($value->tag == "table") {
                $this->ParseTable($value, 0, $this->LinetextArr["table"]);
            } else {
                $this->AnalysisHtmlDom($value);
            }
            $this->recordLastError();
        }

        $endTime = $this->_Time();
        $endMemory = $this->_memory();
        $this->expendTime = round(($endTime - $startTime), 2); // seconds
        $this->expendmemory = round(($endMemory - $startMemory) / 1000, 2); // KB

        $this->CreateWordDom();
    }

    /**
     * Current time in seconds, with microsecond fraction.
     */
    private function _Time()
    {
        return array_sum(explode(" ", microtime()));
    }

    /**
     * Memory currently allocated to the script, in bytes.
     */
    private function _memory()
    {
        return memory_get_usage();
    }

    /**
     * Appends the most recent PHP error to $this->error, skipping "no error"
     * (null) and an error identical to the one recorded last.
     */
    private function recordLastError()
    {
        $last = error_get_last();
        if ($last !== null && end($this->error) !== $last) {
            $this->error[] = $last;
        }
    }

    /**
     * Walks a table node and collects its cells; nested wrapper levels
     * (table/tbody/thead/tfoot/tr) are descended into recursively.
     *
     * @param object $value simple_html_dom node to walk
     * @param int    $i     index under which the cells of this node are stored
     * @param array  $Arr   collector, filled in place as $Arr[index][] = cell.
     *                      Must be taken by reference: by value, every cell
     *                      found here would be thrown away on return.
     */
    private function ParseTable($value, $i, &$Arr)
    {
        $first = $value->firstChild();
        if ($first && in_array($first->tag, array("table", "tbody", "thead", "tfoot", "tr"))) {
            foreach ($value->children as $child) {
                // Post-increment on purpose: each sibling gets the next index.
                $this->ParseTable($child, $i++, $Arr);
            }
            return;
        }

        foreach ($value->children as $cell) {
            $cellFirst = $cell->firstChild();
            // A cell that wraps a nested table is skipped.
            if ($cellFirst && $cellFirst->tag == "table") {
                continue;
            }
            $Arr[$i][] = array("tag" => $cell->tag, "text" => trim($cell->plaintext));
        }
    }

    /**
     * Recursively walks a node and records every leaf (link, image or text
     * node) in $this->LinetextArr.
     *
     * @param object $value simple_html_dom node
     */
    private function AnalysisHtmlDom($value)
    {
        if ($value->has_child()) {
            foreach ($value->children as $child) {
                $this->AnalysisHtmlDom($child);
            }
            return;
        }

        $tmp = array();
        if ($value->tag == "a") {
            $tmp = array("tag" => $value->tag, "href" => $value->href, "text" => $value->innertext);
        } else if ($value->tag == "img") {
            $rawSrc = (string) $value->src;
            $src = $this->unescape($rawSrc);
            $UrlArr = parse_url($src);
            if (!isset($UrlArr['host'])) {
                // Relative URL: join with exactly one slash. Plain
                // concatenation produced "http://hostimg/a.png" for "img/a.png".
                $src = $this->host."/".ltrim($rawSrc, "/");
                $UrlArr = parse_url($src);
            }
            // Download the remote image; false when it cannot be fetched.
            $local = $this->getImageFromNet($src, $UrlArr);
            if ($local) {
                $imgsArr = $this->GD($local);
                if ($imgsArr) {
                    $tmp = array(
                        "tag" => $value->tag,
                        "src" => $local,
                        "text" => $value->alt,
                        "width" => $imgsArr['width'],
                        "height" => $imgsArr['height'],
                    );
                }
            }
        } else {
            $tmp = array("tag" => $value->tag, "text" => strip_tags($value->innertext));
        }

        // Skip images that could not be downloaded or measured.
        if (!empty($tmp)) {
            $this->LinetextArr[] = $tmp;
        }
    }

    /**
     * Reads an image's dimensions and halves them when either side exceeds
     * 800 pixels.
     *
     * @param string $src path of the downloaded image
     * @return array|false array("width"=>..., "height"=>...), or false when
     *                     the file is not a readable image
     */
    private function GD($src)
    {
        $info = getimagesize($src);
        if ($info === false) {
            $this->recordLastError();
            return false;
        }

        $width = $info[0];
        $height = $info[1];
        if ($width > 800 || $height > 800) {
            $width = $width / 2;
            $height = $height / 2;
        }
        return array("width" => $width, "height" => $height);
    }

    /**
     * Decodes percent-encoding plus "%uXXXX", "&#xXXXX;" and "&#NNNN;"
     * escapes back into UTF-8 characters.
     *
     * @param string $str escaped text
     * @return string decoded text
     */
    public function unescape($str)
    {
        $str = rawurldecode($str);
        // Split into escape sequences and single characters (ungreedy ".+").
        preg_match_all("/(?:%u.{4})|&#x.{4};|&#\d+;|.+/U", $str, $r);
        $ar = $r[0];
        foreach ($ar as $k => $v) {
            if (substr($v, 0, 2) == "%u") {
                $ar[$k] = iconv("UCS-2BE", "UTF-8", pack("H4", substr($v, -4)));
            } elseif (substr($v, 0, 3) == "&#x") {
                $ar[$k] = iconv("UCS-2BE", "UTF-8", pack("H4", substr($v, 3, -1)));
            } elseif (substr($v, 0, 2) == "&#") {
                $ar[$k] = iconv("UCS-2BE", "UTF-8", pack("n", (int) substr($v, 2, -1)));
            }
        }
        return join("", $ar);
    }

    /**
     * Downloads an image into <CurrentDir>/<image host>/.
     *
     * @param string      $Src    absolute URL of the image
     * @param array|false $UrlArr parse_url() result for $Src
     * @return string|false path of the saved file relative to CurrentDir,
     *                      or false when the image is unsupported or the
     *                      download/save failed
     */
    private function getImageFromNet($Src, $UrlArr)
    {
        if (!is_array($UrlArr) || !isset($UrlArr['host'], $UrlArr['path'])) {
            return false;
        }

        $this->ImgDir = $this->CurrentDir."/".$UrlArr['host'];

        // Real (last) extension, case-insensitive: "a.b.JPG" -> "jpg".
        $ext = strtolower(pathinfo($UrlArr['path'], PATHINFO_EXTENSION));
        $_supportedImageTypes = array('jpg', 'jpeg', 'gif', 'png', 'bmp', 'tif', 'tiff');
        if (!in_array($ext, $_supportedImageTypes, true)) {
            return false;
        }

        $file = file_get_contents($Src);
        $this->HttpRequestArr[] = $Src;
        $this->HttpRequestTime++;
        if ($file === false) {
            $this->recordLastError();
            return false;
        }

        $this->_mkdir(); // create the directory, or collect the error

        $imgName = md5($UrlArr['path']).".".$ext;
        if (file_put_contents($this->ImgDir."/".$imgName, $file) === false) {
            $this->recordLastError();
            return false;
        }

        $this->DownImg++;
        return $UrlArr['host']."/".$imgName;
    }

    /**
     * Creates $this->ImgDir when missing; a failure is recorded in $this->error.
     */
    private function _mkdir()
    {
        if (!is_dir($this->ImgDir)) {
            // The mode must be an octal integer; the string "7777" is read as
            // decimal 7777, which is a different permission set.
            if (!mkdir($this->ImgDir, 0777, true)) {
                $this->error[] = error_get_last();
            }
        }
    }

    /**
     * Builds the PHPWord document from the parsed content and hands it to
     * downFile().
     */
    private function CreateWordDom()
    {
        $PHPWord = new PHPWord();
        $PHPWord->setDefaultFontName('宋体');
        $PHPWord->setDefaultFontSize("11");
        $styleTable = array('borderSize' => 6, 'borderColor' => '006699', 'cellMargin' => 120);

        // New portrait section
        $section = $PHPWord->createSection();
        $section->addText($this->Details(), array(), array('spacing' => 120));

        foreach ($this->LinetextArr as $key => $lineArr) {
            if (isset($lineArr['tag'])) {
                if ($lineArr['tag'] == "li") {
                    $section->addListItem($lineArr['text'], 0, "", "", array('spacing' => 120));
                } else if ($lineArr['tag'] == "img") {
                    $section->addImage($lineArr['src'], array('width' => $lineArr['width'], 'height' => $lineArr['height'], 'align' => 'center'));
                } else if ($lineArr['tag'] == "p") {
                    $section->addText($lineArr['text'], array(), array('spacing' => 120));
                }
            } else if ($key === "table" && !empty($lineArr)) {
                // Strict comparison: on PHP < 8, 0 == "table" is true, which
                // would treat an integer-keyed entry as the table bucket.
                $PHPWord->addTableStyle('myOwnTableStyle', $styleTable);
                $table = $section->addTable("myOwnTableStyle");
                foreach ($lineArr as $tr) {
                    $table->addRow();
                    foreach ($tr as $td) {
                        $table->addCell(2000)->addText($td['text']);
                    }
                }
            }
        }

        $this->downFile($PHPWord);
    }

    /**
     * One-line summary of the run (requests, images, time, memory) that is
     * written at the top of the generated document.
     *
     * @return string
     */
    public function Details()
    {
        $msg = "一共请求:{$this->HttpRequestTime}次,共下载的图片有{$this->DownImg}张,并且下载完成大约使用时间:{$this->expendTime}秒,整个程序执行大约消耗内存是:{$this->expendmemory}KB,";
        return $msg;
    }

    /**
     * Saves the document to $this->filename (default: "<host>.docx") and
     * streams it to the client as a download.
     *
     * @param PHPWord $PHPWord document to write
     */
    public function downFile($PHPWord)
    {
        if (empty($this->filename)) {
            $UrlArr = parse_url($this->url);
            $this->filename = $UrlArr['host'].".docx";
        }

        // Save File
        $objWriter = PHPWord_IOFactory::createWriter($PHPWord, 'Word2007');
        $objWriter->save($this->filename);

        header("Pragma: public");
        header("Expires: 0");
        header("Cache-Control: must-revalidate, post-check=0, pre-check=0");
        header("Cache-Control: public");
        header("Content-Description: File Transfer");
        // The Word2007 writer produces OOXML, so send the .docx media type.
        header('Content-Type: application/vnd.openxmlformats-officedocument.wordprocessingml.document');
        // Force the download; the filename is quoted and stripped of any path.
        header('Content-Disposition: attachment; filename="'.basename($this->filename).'"');

        if (!is_readable($this->filename) || readfile($this->filename) === false) {
            $this->recordLastError();
        }
    }
}
The key point of the above code does not seem to be Word generation, but rather the use of SimpleHtmlDom, an open-source HTML parser. As mentioned before, I have been reading its code these days, which has led to two learning directions:
① Regular expressions
② Organizing the functions of this extension class
Reaping insights from the source code:
PHP exceptions can be caught, and PHP errors can also be caught.
error_get_last() // Use this function to capture the last PHP error raised on the page.
Summary: The above is the entire content of this article, I hope it will be helpful to everyone's study.
Related recommendations:
php-fpm example of adding service service
php-Fpm service startup script method
PHP data type conversion (character to number, number to character)
The above is the detailed content of How to convert HTML page to word using php and save it. For more information, please follow other related articles on the PHP Chinese website!

在之前的文章《实用Word技巧分享:详解怎么更改图片颜色和形状》中,我们了解了更改图片颜色和图片形状的方法。而今天我们来聊一聊word表格,讲解美化表格--自定义表格样式的方法,快来看看吧!

在之前的文章《实用Word技巧分享:聊聊你没用过的“行号”功能》中,我们了解了Word中你肯定没用过的"行号”功能。今天继续实用Word技巧分享,看看Excel表格怎么借用Word进行分栏打印,快来收藏使用吧!

在之前的文章《实用Word技巧分享:隐藏图片,提升文档浏览和编辑效率!》中,我们学习了隐藏图片的技巧,可提升文档浏览和编辑效率。下面本篇文章再给大家分享一个实用Word技巧,看看怎么让页面自动滚动,快来收藏使用吧!

在之前的文章《实用Word技巧分享:设置页码的终极方法!》中,我们学习了Word页码的设置方法。而今天我们来一起聊聊Word文本间距设置的几个技巧,快来收藏使用吧!

在之前的文章《实用Word技巧分享:表格自定义样式,美化表格!》中,我们了解了自定义表格样式的方法。而今天我们来聊一聊word脚注和尾注,介绍一下脚注和尾注的设置使用方法,快来看看吧!

在之前的文章《实用Word技巧分享:怎么跨文档快速复制样式》中,我们了解了在文档间快速复制样式的方法。今天我们聊聊Word快捷键,聊聊【F4】键快速统一图片大小,快来看看吧!

在之前的文章《实用Word技巧分享:如何一键删除所有数字》中,我们学习了Word中一键删除所有数字的方法。而今天我们来聊聊Word表格中如何自动添加编号,简单却很实用!

在之前的文章《实用Word技巧分享:图、表如何自动编号?》中,我们了解了Word排版-让图、表自动编号的方法。而今天聊聊Word精确控制页面“行数”和“字符个数”的方法,快来看看吧!


Hot AI Tools

Undresser.AI Undress
AI-powered app for creating realistic nude photos

AI Clothes Remover
Online AI tool for removing clothes from photos.

Undress AI Tool
Undress images for free

Clothoff.io
AI clothes remover

AI Hentai Generator
Generate AI Hentai for free.

Hot Article

Hot Tools

Dreamweaver Mac version
Visual web development tools

MinGW - Minimalist GNU for Windows
This project is in the process of being migrated to osdn.net/projects/mingw, you can continue to follow us there. MinGW: A native Windows port of the GNU Compiler Collection (GCC), freely distributable import libraries and header files for building native Windows applications; includes extensions to the MSVC runtime to support C99 functionality. All MinGW software can run on 64-bit Windows platforms.

MantisBT
Mantis is an easy-to-deploy web-based defect tracking tool designed to aid in product defect tracking. It requires PHP, MySQL and a web server. Check out our demo and hosting services.

Atom editor mac version download
The most popular open source editor

Notepad++7.3.1
Easy-to-use and free code editor
