-
/**
 * Extract the anchors (<a ... href=...>...</a>) found in an HTML document.
 *
 * The returned array only contains a key when at least one non-empty value
 * was found for it:
 *  - 'link'    : href values; all quoted hrefs first, then all unquoted ones
 *  - 'content' : the markup between the opening and closing anchor tags
 *  - 'all'     : the complete matched anchor elements
 *
 * Because empty values are skipped and quoted/unquoted hrefs are collected
 * separately, the positions in 'link' and 'content' are not guaranteed to
 * line up with each other.
 *
 * @param string $document HTML source to scan.
 * @return array Map with the optional keys 'link', 'content' and 'all';
 *               an empty array when the document contains no anchors.
 */
function match_links($document)
{
    // Group 1: optional opening quote. Group 2: href when quoted (read up to
    // the same quote). Group 3: href when unquoted (read up to whitespace or
    // '>'). Group 4: anchor content. The "~" delimiter avoids escaping "/".
    $pattern = '~<\s*a\s.*?href\s*=\s*(["\'])?(?(1)(.*?)\1|([^\s>]+))[^>]*>?(.*?)</a>~isx';

    $match = array();
    if (!preg_match_all($pattern, $document, $links)) {
        return $match;
    }

    // Capture group => result key, in the order the values are appended.
    $targets = array(2 => 'link', 3 => 'link', 4 => 'content', 0 => 'all');
    foreach ($targets as $group => $name) {
        foreach ($links[$group] as $val) {
            // Groups that did not take part in a match come back as '';
            // comparing against '' (not empty()) keeps a literal "0" value.
            if ($val !== '') {
                $match[$name][] = $val;
            }
        }
    }

    return $match;
}
Copy code
This is mainly a matter of getting the regular expression right. Here is a regular expression for ASP.NET that has been tested many times.
Regular expression for extracting the links from a page:
-
/// <summary>
/// Collects the href values found in a piece of HTML.
/// </summary>
/// <param name="HtmlCode">HTML source to scan; null or empty yields an empty result.</param>
/// <returns>
/// Every href value, lower-cased and each followed by "||"
/// (for example "http://a.com/x||/y||"); an empty string when none is found.
/// </returns>
public string GetHref(string HtmlCode)
{
    if (string.IsNullOrEmpty(HtmlCode))
    {
        return "";
    }

    // Group 6 is the href value itself: word characters, '\', '/', '.', ':',
    // '-' and '_'. Groups 5 and 8 are the optional surrounding quote / '>'.
    string Reg = @"(h|H)(r|R)(e|E)(f|F) *= *('|"")?((\w|\\|\/|\.|:|-|_)+)('|""| *|>)?";

    System.Text.StringBuilder MatchVale = new System.Text.StringBuilder();
    foreach (Match m in Regex.Matches(HtmlCode, Reg))
    {
        // Take the captured value rather than stripping "href=" from the whole
        // match: the pattern allows spaces around '=', and the whole match also
        // contains the quotes or a trailing '>'.
        MatchVale.Append(m.Groups[6].Value.ToLower()).Append("||");
    }
    return MatchVale.ToString();
}
Copy code
Example 2: PHP code that uses a regular expression to download the remote images referenced in a piece of content.
The program uses a PHP regular expression to find the images in the content, then downloads and saves those that are not hosted under the site's own domain name.
It is in effect an important part of a "scraper" (so-called "thief") program.
The code below is only the part that downloads the remote images.
-
// Localise the remote images referenced in $content: every http:// image URL
// that is not hosted on our own site is downloaded into upload/<Ymd>/ and the
// URL inside $content is replaced by the local path. Images that cannot be
// fetched or stored are replaced by a default image.
$allowedTypes  = array('jpg', 'jpeg', 'gif', 'png');
$ownHost       = 'jbxue.com';
$fallbackImage = '/upload/201204/20120417213810742.gif';

// "~" is used as the delimiter so the slashes of "http://" need no escaping.
// The extension is a real alternation, so only image URLs are matched.
if (preg_match_all('~http://[^ "\']+\.(?:jpe?g|gif|png)~ui', stripcslashes($content), $aliurl)) {
    $i = 0; // Counter that keeps names unique when several files share a second

    // A URL that occurs several times only needs to be downloaded once;
    // str_replace() below rewrites all of its occurrences.
    foreach (array_unique($aliurl[0]) as $v) {
        // Images from our own site are left untouched (and not downloaded).
        if (stripos($v, $ownHost) !== false) {
            continue;
        }

        // The extension becomes part of a file name on this server, so it is
        // checked against a whitelist: this prevents a remote ".php" (or
        // similar) file from being stored inside the upload directory.
        $filetype = strtolower(pathinfo($v, PATHINFO_EXTENSION));
        $ff = in_array($filetype, $allowedTypes, true) ? @file_get_contents($v) : false;

        if (empty($ff)) {
            // Could not fetch the image: use the default image instead.
            $content = str_replace($v, $fallbackImage, $content);
            continue;
        }

        $dir = 'upload/' . date('Ymd') . '/'; // Storage path for today's files
        if (!is_dir($dir) && !@mkdir($dir, 0777, true) && !is_dir($dir)) {
            $content = str_replace($v, $fallbackImage, $content);
            continue;
        }

        // "H" is the 24-hour clock; "h" would repeat names after 12 hours.
        $nfn = $dir . date('YmdHis') . $i . '.' . $filetype;
        if (@file_put_contents($nfn, $ff) === false) {
            $content = str_replace($v, $fallbackImage, $content);
            continue;
        }

        $i++;
        $content = str_replace($v, $nfn, $content); // Point the content at the local copy
    }
}
Copy code
Example 3, PHP downloads images to local through regular expressions.
-
-
/*
 * Limitation: images whose path in the web page is not an absolute URL
 * cannot be fetched.
 */
set_time_limit(0); // Crawling can take a long time; lift the execution time limit
$URL = 'http://pp.baidu.com/'; // Any page URL
get_pic($URL);
/**
 * Fetch a web page and save the images referenced by its <img> tags.
 *
 * Only quoted src attributes ending in .jpg, .jpeg, .gif, .png or .bmp are
 * considered (the same types get_name() knows how to store). The src values
 * are handed to get_name() unchanged, so relative paths are not resolved.
 *
 * @param string $pic_url URL of the page to scan.
 * @return int Always 0.
 */
function get_pic($pic_url)
{
    // Fetch the page source
    $data = CurlGet($pic_url);

    $arr_src = array();
    if (is_string($data) && $data !== '') {
        // Group 1 is the quoted src value of an <img> tag.
        $pattern_src = '/<img\b[^>]*?\bsrc\s*=\s*["\']([^"\'>\s]+\.(?:jpe?g|gif|png|bmp))["\'][^>]*>/i';
        if (preg_match_all($pattern_src, $data, $match_src)) {
            $arr_src = $match_src[1];
        }
    }

    get_name($arr_src);
    echo " finished!!!";

    return 0;
}
/**
 * Download each image and store it in the current working directory.
 *
 * The file name is built from get_unique() plus the image's own extension.
 * Items without a recognised extension, or that cannot be downloaded or
 * written, are skipped.
 *
 * @param array $pic_arr Image URLs to download.
 * @return int Always 0.
 */
function get_name($pic_arr)
{
    // Extension at the end of the path, optionally followed by a query string
    // or fragment; anchoring it avoids matching ".gif" in the middle of a URL.
    $pattern_type = '/\.(jpe?g|bmp|gif|png)(?=$|[?#])/i';

    foreach ($pic_arr as $pic_item) { // One download per image address
        if (!preg_match($pattern_type, $pic_item, $match_type)) {
            continue;
        }

        // Download first so that a failed request leaves no empty file behind.
        $image = CurlGet($pic_item);
        if (!is_string($image) || $image === '') {
            continue;
        }

        // $match_type[0] includes the leading dot, e.g. ".jpg"
        $pic_name = get_unique() . strtolower($match_type[0]);
        if (@file_put_contents($pic_name, $image) === false) {
            continue;
        }

        echo "[OK]..!";
    }

    return 0;
}
/**
 * Build an identifier from the current time with microsecond resolution.
 *
 * @return string The Unix timestamp in seconds followed by exactly six
 *                microsecond digits.
 */
function get_unique()
{
    // microtime() returns "0.mmmmmmmm ssssssssss"
    list($msec, $sec) = explode(" ", microtime());

    // Take the six digits after "0." as text: converting through a number
    // would drop leading zeros and make the identifier's width vary.
    return $sec . substr($msec, 2, 6);
}
/**
 * Fetch the content of a URL with cURL.
 *
 * Cookies are read from and written to "cookie.txt" in the current working
 * directory. Redirects are not followed.
 *
 * @param string $url URL to fetch; HTML-escaped ampersands are decoded first.
 * @return string|bool The response body, or false when the request fails.
 */
function CurlGet($url)
{
    // URLs taken from HTML attributes carry the ampersand as an entity. The
    // entity is written as a concatenation so that tools which decode HTML
    // entities cannot turn this line into a no-op replacement.
    $url = str_replace('&' . 'amp;', '&', $url);

    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_HEADER, false); // Body only, no response headers
    //curl_setopt($curl, CURLOPT_REFERER,$url);
    curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; SeaPort/1.2; Windows NT 5.1; SV1 ; InfoPath.2)");
    curl_setopt($curl, CURLOPT_COOKIEJAR, 'cookie.txt');
    curl_setopt($curl, CURLOPT_COOKIEFILE, 'cookie.txt');
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1); // Return the body instead of printing it
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 0);
    $values = curl_exec($curl);
    curl_close($curl);

    return $values;
}
- ?>
-
Copy code
|