首頁 >php教程 >php手册 >curl采集 根据关键词 获取雅虎竞价排名

curl采集 根据关键词 获取雅虎竞价排名

WBOY
WBOY原創
2016-06-13 09:35:06 1062瀏覽

之前写过curl批处理采集数据,这里贴上完整版本,代码很简单,废话不说,上代码,新手欢迎指教!!!

代码只实现到获取链接这一步;至于排名,返回数组的键(从 0 开始)就对应链接在结果中的先后顺序,也就是排名。

<?php
/**
 * Based on yahoo access to data
 *
 * @author chujiu <527891885@qq.com>
 * @copyright 2014.04.26 By chujiu
 * @version 0.2.1 2014.04.26
 */

/**
 * Collects the links found on Yahoo! JAPAN search result pages for a keyword.
 *
 * Several result pages are requested concurrently through the curl_multi API.
 * From each page, the part between the `<div id="web">` marker and the
 * `<div id="posS" class="spns">` marker is kept, the pager (`<div id="pg">`),
 * related-search (`<div id="rel">`) and `<em>` fragments are stripped, and
 * every remaining `href="..."` value is returned, URL-decoded, in page order.
 */
class DataCollectionRank {

    /** Step between successive values of the `b` query parameter. */
    const   PAGE = 10;

    /** Directory that _writeFile() writes into; set by exec_curl_get_data(). */
    public  $path = '';

    /** Largest `b` value to request; with PAGE = 10 the default gives 1, 11, ..., 91. */
    public  $main = 91;

    /**
     * Builds one curl handle per result page and registers it on a multi handle.
     *
     * @param mixed $keyword Search keyword; must be a non-empty scalar.
     * @return array|string  '' when the keyword is unusable, otherwise
     *                       array('mh' => multi handle,
     *                             'handle' => array(offset => array('ch' => ..., 'url' => ...))).
     */
    private function _gather_data($keyword) {
        // empty() would also reject the perfectly valid keyword "0".
        if (!is_scalar($keyword) || (string) $keyword === '') {
            return '';
        }
        $query = urlencode((string) $keyword);
        // 'handle' is always present so callers can iterate even when $main < 1.
        $chs = array('handle' => array(), 'mh' => curl_multi_init());
        for ($i = 1; $i <= $this->main; $i += self::PAGE) {
            $url = 'http://search.yahoo.co.jp/search?p='.$query.'&tid=top_ga1_sa&ei=UTF-8&aq=-1&oq='.$query.'&pstart=1&fr=top_ga1_sa&b='.$i;
            $ch = curl_init();
            curl_setopt_array($ch, array(
                CURLOPT_URL => $url,
                CURLOPT_HEADER => false,
                // NOTE(review): peer verification is disabled; harmless for the
                // plain-http URL above, but unsafe if the URL is ever moved to https.
                CURLOPT_SSL_VERIFYPEER => false,
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_TIMEOUT => 30,
                CURLOPT_AUTOREFERER => true
            ));
            curl_multi_add_handle($chs['mh'], $ch);
            $chs['handle'][$i] = array('ch' => $ch, 'url' => $url);
        }
        return $chs;
    }

    /**
     * Fetches all result pages for a keyword and returns the links found.
     *
     * Transfer errors are appended to `get_rank_log.txt` inside $path.
     *
     * @param mixed  $keyword Search keyword.
     * @param string $path    Directory used for the error log.
     * @return array|string   '' when the keyword is empty; otherwise a
     *                        0-indexed list of decoded links in page order
     *                        (an empty array when nothing was found).
     */
    public function exec_curl_get_data($keyword, $path) {
        $this->path = $path;
        $chs = $this->_gather_data($keyword);
        if (empty($chs)) {
            return '';
        }

        // Drive all transfers. curl_multi_select() blocks until there is socket
        // activity, so the loop does not spin at 100% CPU while waiting.
        $active = null;
        do {
            $mrc = curl_multi_exec($chs['mh'], $active);
            if ($active > 0 && $mrc === CURLM_OK && curl_multi_select($chs['mh'], 1.0) === -1) {
                // select() can fail immediately on some platforms; back off briefly.
                usleep(1000);
            }
        } while ($active > 0 && ($mrc === CURLM_OK || $mrc === CURLM_CALL_MULTI_PERFORM));

        $error = '';
        $links = array();
        foreach ($chs['handle'] as $request) {
            $message = curl_error($request['ch']);
            if ($message !== '') {
                $error .= "\n".'error提示:'.$message.'-------URL:'.$request['url'].'--------时间:'.date('Y-m-d H:i:s')."\n";
            } else {
                $body = curl_multi_getcontent($request['ch']);
                if (!empty($body)) {
                    // Links are kept as array items: joining them with '|' and
                    // splitting again would break any URL that contains '|'.
                    foreach ($this->_get_keyword_link_preg($body) as $link) {
                        $links[] = $link;
                    }
                }
            }
            curl_multi_remove_handle($chs['mh'], $request['ch']);
            curl_close($request['ch']);
        }
        curl_multi_close($chs['mh']);

        if ($error != '') {
            $this->_writeFile('get_rank_log.txt', $error, 'ab+');
        }
        return $links;
    }

    /**
     * Extracts the decoded href values from one result page.
     *
     * @param string $str Raw HTML of a result page.
     * @return array 0-indexed list of URL-decoded links; empty when the page
     *               lacks the `<div id="web">` marker or contains no links.
     */
    private function _get_keyword_link_preg ($str) {
        if (empty($str)) {
            return array();
        }
        $parts = explode('<div id="web">', $str);
        if (!isset($parts[1])) {
            // Marker missing (error page, changed layout, ...): nothing to parse.
            return array();
        }
        $section = explode('<div id="posS" class="spns">', $parts[1]);
        $html = $section[0];
        $strip = array(
            '#<div id=\"pg\">[\s\S]+#',
            '#<div id=\"rel\">[\s\S]+#',
            '#<em>[\s\S]+?</em>#',
        );
        foreach ($strip as $pattern) {
            $html = preg_replace($pattern, '', $html);
            if ($html === null) {
                // PCRE failure (e.g. backtrack limit); treat the page as unparseable.
                return array();
            }
        }
        if (!preg_match_all('#href=\"(.*?)\">#', $html, $matches)) {
            return array();
        }
        return array_map('urldecode', $matches[1]);
    }

    /**
     * Writes $data to $fileName inside $this->path.
     *
     * @param string $fileName File name relative to $this->path.
     * @param string $data     Content to write.
     * @param string $method   fopen() mode; with "rb+" the file is overwritten
     *                         from the start and truncated to the new length.
     * @param mixed  $iflock   Truthy to hold an exclusive lock while writing.
     * @param mixed  $check    Truthy to terminate the script when the target
     *                         path contains '..'.
     * @param mixed  $chmod    Truthy to chmod the file to 0777 afterwards.
     * @return bool True when all of $data was written, false otherwise.
     */
    public function _writeFile($fileName, $data, $method="rb+", $iflock=1, $check=1, $chmod=1){
        $target = $this->path.'/'.$fileName;
        if ($check && strpos($target, '..') !== false) {
            exit('403 Forbidden!');
        }
        $data = (string) $data;

        // Best effort: make sure the file exists so that "rb+" can open it.
        @touch($target);
        $handle = @fopen($target, $method);
        if ($handle === false) {
            // Without this guard the calls below receive false instead of a
            // stream, which is a TypeError on PHP 8.
            return false;
        }

        $locked = $iflock && flock($handle, LOCK_EX);
        $written = fwrite($handle, $data);
        if ($method == "rb+" && $written !== false) {
            ftruncate($handle, strlen($data));
        }
        fflush($handle);
        if ($locked) {
            flock($handle, LOCK_UN);
        }
        fclose($handle);

        // NOTE(review): 0777 makes the file world-writable; kept for compatibility.
        // Suppressed because chmod fails when the file belongs to another user.
        $chmod && @chmod($target, 0777);

        return $written === strlen($data);
    }
}
?>

 

/**
 * Removes duplicate rows from a two-dimensional array and labels the first
 * two fields of each remaining row as 'keyword' and 'domain'.
 *
 * Rows are compared field by field after casting every field to string, so
 * fields may safely contain commas. Output values are strings.
 *
 * Keys of the result follow the original behaviour: each unique row is keyed
 * by the positional index of its LAST occurrence in the input, while the
 * iteration order is the order of FIRST occurrence.
 *
 * @param array $array List of rows; each row is an array of scalar fields.
 * @return array array(index => array('keyword' => string, 'domain' => string|null));
 *               'domain' is null when a row has fewer than two fields.
 */
function array_unique_fb($array){
    if (!is_array($array)) {
        return array();
    }

    $keyBySignature = array();  // signature => positional index of last occurrence
    $rowBySignature = array();  // signature => string-cast fields
    $position = 0;
    foreach ($array as $row) {
        $fields = array_map('strval', array_values((array) $row));
        // serialize() yields an unambiguous signature; joining with ',' would
        // confuse array('a,b') with array('a', 'b').
        $signature = serialize($fields);
        // Re-assigning an existing key updates the value but keeps its
        // first-seen position in the array.
        $keyBySignature[$signature] = $position;
        $rowBySignature[$signature] = $fields;
        $position++;
    }

    $data = array();
    foreach ($keyBySignature as $signature => $key) {
        $fields = $rowBySignature[$signature];
        $data[$key] = array(
            'keyword' => isset($fields[0]) ? $fields[0] : '',
            'domain'  => isset($fields[1]) ? $fields[1] : null,
        );
    }
    return $data;
}

 

 

陳述:
本文內容由網友自願投稿,版權歸原作者所有。本站不承擔相應的法律責任。如發現涉嫌抄襲或侵權的內容,請聯絡admin@php.cn