php实现爬虫
Python 是写爬虫的首选语言,这里改用原生 PHP 来实现。PHP 也有对应的爬虫框架,平时不考虑效率时可以自己动手写。
一、最简单的爬虫就是直接读取网页,然后用正则从网页源码中匹配出想要的内容,比如居然搞笑网、嘻哈网。下面是抓取居然搞笑网的代码:
<?php
// Crawler bootstrap: database settings, runtime limits and response header.

// MySQL connection settings read by db_connect().
define('HOST', 'localhost');
define('USER', 'root');
define('PWD', 'root');
define('DB', 'test');

// The crawl issues many sequential requests, so lift the execution time
// limit and set the memory ceiling explicitly.
set_time_limit(0);
ini_set('memory_limit', '128M');

// Declare the response as UTF-8 HTML.
header("Content-Type: text/html;charset=utf-8");
/**
 * Open the MySQL link described by the HOST/USER/PWD/DB constants and switch
 * the connection charset to utf8.
 *
 * NOTE: this uses the legacy ext/mysql API (removed in PHP 7). Moving to
 * mysqli/PDO has to be done together with every caller that passes the
 * returned link to mysql_* functions.
 *
 * @return resource|false The MySQL link, or false when connecting or
 *                        selecting the database fails.
 */
function db_connect()
{
    $con = mysql_connect(HOST, USER, PWD);
    if ($con === false) {
        // mysql_connect() has already raised a warning; do not keep going
        // with an invalid link.
        return false;
    }

    // Bind the selection to this link explicitly instead of relying on the
    // implicit "last opened link".
    if (!mysql_select_db(DB, $con)) {
        return false;
    }

    mysql_query('set names utf8', $con);

    return $con;
}
//
/**
 * Walk the zbjuran listing pages and store every entry found.
 *
 * Performs 500 requests in total: the index page first, then
 * list_2_2.html up to list_2_500.html. The next page URL is only advanced
 * when get_data() reports 1; otherwise the same URL is requested again.
 */
function do_url()
{
    $base = "http://www.zbjuran.com/quweitupian/";
    $target = $base;
    $round = 0;

    while ($round < 500) {
        $stored = get_data(curl_get($target));
        if ($stored == 1) {
            // Page numbering starts at 2 because the index is page 1.
            $target = $base . "list_2_" . ($round + 2) . ".html";
        }
        $round++;
    }
}
/**
 * Download a listing page and extract (title, image) pairs from it.
 *
 * @param string $url Page to fetch.
 *
 * @return array|null preg_match_all() result in pattern order
 *                    ([0] full matches, [1] titles, [2] image src values),
 *                    or null when the response is a 404 or the body is empty
 *                    or the transfer failed.
 */
function curl_get($url)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0);

    $result = curl_exec($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);

    // Release the handle on every path; previously the success path returned
    // before reaching curl_close().
    curl_close($ch);

    if ((int) $code === 404 || !$result) {
        return null;
    }

    preg_match_all('/<h3><a href=".*?".*?<b>(.*?)<\/b>[\s\S]*?src="(.*?)"/', $result, $mc);

    return $mc;
}
/**
 * Convert the matches produced by curl_get() into rows and insert them.
 *
 * @param array|null $data preg_match_all() result in pattern order:
 *                         [0] full matches, [1] titles, [2] image src values.
 *                         null (failed fetch) is treated as "no rows".
 *
 * @return int Always 1.
 */
function get_data($data)
{
    // A failed fetch yields null; there is nothing to count or insert.
    if (!is_array($data) || empty($data[0])) {
        return 1;
    }

    foreach (array_keys($data[0]) as $i) {
        // The capture groups already hold title and image; no need to re-run
        // the pattern against the full match.
        $rawTitle = isset($data[1][$i]) ? $data[1][$i] : '';
        $rawImg = isset($data[2][$i]) ? $data[2][$i] : '';

        // The site serves GB2312. Convert to UTF-8 first and escape second:
        // htmlspecialchars() validates its input against the target charset
        // and returns an empty string for byte sequences that are not valid
        // UTF-8, which is what raw GB2312 text looks like.
        $title = htmlspecialchars(iconv('GB2312', 'UTF-8//IGNORE', $rawTitle), ENT_COMPAT, 'UTF-8');
        $img = htmlspecialchars(iconv('GB2312', 'UTF-8//IGNORE', $rawImg), ENT_COMPAT, 'UTF-8');

        do_insert(array(
            'img' => $img,
            'title' => $title,
            'from' => 'juran'
        ));
    }

    return 1;
}
/**
 * Insert one scraped entry into the article_news_thrid table.
 *
 * The values come from a remote page, so each one is escaped for the current
 * link before being placed in the statement. Failures are logged instead of
 * being silently dropped.
 *
 * @param array $data Row with the keys 'title', 'img' and 'from'.
 *
 * @return void
 */
function do_insert($data)
{
    $con = db_connect();
    if (!$con) {
        error_log('do_insert: no database connection, row skipped');
        return;
    }

    $ctime = date('Y-m-d H:i:s');
    $title = mysql_real_escape_string($data['title'], $con);
    $img = mysql_real_escape_string($data['img'], $con);
    $from = mysql_real_escape_string($data['from'], $con);

    $sql = "insert into article_news_thrid(title,cover,createdAt,`from`) values('$title','$img','$ctime','$from')";

    if (mysql_query($sql, $con) === false) {
        error_log('do_insert: insert failed: ' . mysql_error($con));
    }
}
// Script entry point: run the crawl, then terminate the script.
do_url();
exit;
二、第二种是页面数据由某个接口返回的情况。这种不能像上面那样直接抓取页面,需要先找出请求的接口及其参数规律,比如多玩。与上面代码不同的部分如下:
/**
 * Page through the duowan listing endpoint and store every entry found.
 *
 * Performs 10 requests with offsets 30, 60, ... 300. The offset is only
 * advanced when get_data() reports 1; otherwise the same URL is requested
 * again.
 */
function do_url()
{
    // This is the data endpoint the page requests, not the home page link.
    $target = "http://tu.duowan.com/m/gaoxiao?offset=30&order=created&math=0.09469478387750963";
    $round = 0;

    while ($round < 10) {
        $stored = get_data(curl_get($target));
        if ($stored == 1) {
            // Each page holds 30 items; the next offset is (round + 2) * 30.
            $nextOffset = ($round + 2) * 30;
            $target = "http://tu.duowan.com/m/gaoxiao?offset=" . $nextOffset . "&order=created&math=0.09469478387750963";
        }
        $round++;
    }
}
.......
三、第三种是需要登录才能抓取的网站,比如知乎、微博。可以通过模拟登录获取 cookie,也可以先在浏览器中正常登录,再把浏览器里的 cookie 复制出来,附加到后续请求上。代码如下:
// Fetch a page that requires a logged-in session by replaying a browser
// cookie string and user agent with the request.
$url = 'http://weibo.com/u/5875206453/home?wvr=5&lf=reg';
$userAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36";
// Cookie header value copied from an authenticated browser session.
$cook = 'SINAGLOBAL=8612854206507.637.1500347246746; UOR=,,www.china-pub.com; YF-V5-G0=73b58b9e32dedf309da5103c77c3af4f; YF-Ugrow-G0=b02489d329584fca03ad6347fc915997; _s_tentry=-; Apache=2569806581226.6855.1500888849747; ULV=1500888849754:3:3:1:2569806581226.6855.1500888849747:1500606517658; appkey=; login_sid_t=f7ffc067fbae863728c33cc48876dd6b; WBStorage=cd7f674a73035f73|undefined; SSOLoginState=1500892373; SCF=AphjCI-_sSVp7z3OP41QVZB43FQ8GGXRDAEgQhm8n5ZYbdlpKAsepWKo5BbwlN7jxC62ko3Eg-hM61nPoSFVT0Y.; SUB=_2A250cbyGDeRhGeNG7FcT8CjIzj-IHXVXBqlOrDV8PUNbmtBeLUTzkW8HlK2p-ctoZOlDRy_vdepe2rIOdQ..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WF_bqgULG1xFWf4D23BueOp5JpX5K2hUgL.Fo-RS0-EehqXSKe2dJLoIEBLxKML1K.LB.BLxKnLBK2LBKeLxK-L12qL12zLxKqLB.-L1hnt; SUHB=0BLoH6RnCVU3eS; ALF=1532428373; un=15700353368';

$ch = curl_init();
curl_setopt_array($ch, array(
    CURLOPT_URL => $url,
    CURLOPT_HEADER => false,
    // Follow redirects issued by the site.
    CURLOPT_FOLLOWLOCATION => true,
    // Return the body from curl_exec() instead of printing it.
    CURLOPT_RETURNTRANSFER => true,
    // Certificate verification is switched off for this request.
    CURLOPT_SSL_VERIFYPEER => false,
    // CURLOPT_REFERER => $url,  (left disabled)
    CURLOPT_USERAGENT => $userAgent,
    CURLOPT_COOKIE => $cook,
));

$result = curl_exec($ch);
curl_close($ch);

return $result;