一、PHPCrawler的介绍与安装
先了解一下什么是抓取?
抓取就是网络爬虫,也就是人们常说的网络蜘蛛(spider)。是搜索引擎的一个重要组成部分,按照一定的逻辑和算法抓取和下载互联网上的信息和网页。一般的爬虫从一个start url开始,按照一定的策略开始爬取,把爬取到的新的url放入爬取队列中,然后进行新一轮的爬取,直到抓取完毕为止。
PHPCrawler是一个国外开源的爬虫系统,它的源码托管在sourceforge里,可以在 SourceForge 上的 phpcrawl 项目页面下载
,根据自己电脑里安装的PHP版本选择合适的版本下载。下载完毕之后,解压到服务器网站根目录下,复制example.php文件,并重命名。
二、完整源码
<?php
// It may take a whils to crawl a site ...
set_time_limit(10000);
// Inculde the phpcrawl-mainclass
include("libs/PHPCrawler.class.php");
// Extend the class and override the handleDocumentInfo()-method
class MyCrawler extends PHPCrawler
{
//在这里解析页面内容
function handleDocumentInfo($DocInfo)
{
// Just detect linebreak for output ("\n" in CLI-mode, otherwise "<br>").
if (PHP_SAPI == "cli") $lb = "\n";
else $lb = "<br />";
// Print the URL and the HTTP-status-Code
echo "Page requested: ".$DocInfo->url." (".$DocInfo->http_status_code.")".$lb;
// Print the refering URL
echo "Referer-page: ".$DocInfo->referer_url.$lb;
// Print if the content of the document was be recieved or not
if ($DocInfo->received == true)
echo "Content received: ".$DocInfo->bytes_received." bytes".$lb;
else
echo "Content not received".$lb;
// Now you should do something with the content of the actual
// received page or file ($DocInfo->source), we skip it in this example
//echo $DocInfo->source;
//echo $lb;
$url=$DocInfo->url;
$pat="/http:\/\/www\.kugou\.com\/yy\/special\/single\/\d+\.html/";
if(preg_match($pat,$url)>0){
$this->parseSonglistDetail($DocInfo);
}
flush();
}
public function parseSonglistDetail($DocInfo){
$songlistArr=array();
$songlistArr['raw_url']=$DocInfo->url;
$content=$DocInfo->content;
//名称
$matches=array();
$pat="/<span>名称:<\/span>([^(<br)]+)<br \/>/";
$res=preg_match($pat, $content,$matches);
if($res>0){
$songlistArr['title']=$matches[1];
}else{
$songlistArr['title']="";
print "error:get title fail<br/>";
}
//创建人
$matches=array();
$pat="/<span>创建人:<\/span>([^(<br)]+)<br \/>/";
$res=preg_match($pat, $content,$matches);
if($res>0){
$songlistArr['creator']=$matches[1];
}else{
$songlistArr['creator']="";
print "error:get creator fail<br/>";
}
//创建时间
$matches=array();
$pat="/<span>更新时间:<\/span>([^(<br)]+)<br \/>/";
$res=preg_match($pat, $content,$matches);
if($res>0){
$songlistArr['create_date']=$matches[1];
}else{
$songlistArr['create_date']="";
print "error:get create_date fail<br/>";
}
//简介
$matches=array();
$pat="/<span>简介:<\/span>([^(<\/p)]*)<\/p>/";
$res=preg_match($pat, $content,$matches);
if($res>0){
$songlistArr['info']=$matches[1];
}else{
$songlistArr['info']="";
print "error:get info fail<br/>";
}
//歌曲
$matches=array();
$pat="/<a title=\"([^\"]+)\" hidefocus=\"/";
$res=preg_match_all($pat, $content,$matches);
if($res>0){
$songlistArr['songs']=array();
for($i=0;$i<count($matches[1]);$i++){
$song_title=$matches[1][$i];
array_push($songlistArr['songs'],array('title'=>$song_title));
}
}else{
$songlistArr['song']="";
print "error:get song fail<br/>";
}
echo "<pre>";
print_r($songlistArr);
echo "</pre>";
$this->saveSonglist($songlistArr);
}
public function saveSonglist($songlistArr){
//连接数据库
$conn=mysql_connect("localhost","root","root");
mysql_select_db("songlist",$conn);
mysql_query("set names utf8");
$songlist=array();
$songlist['title']=mysql_escape_string($songlistArr['title']);
$songlist['create_time']=mysql_escape_string($songlistArr['create_date']);
$songlist['creator']=mysql_escape_string($songlistArr['creator']);
$songlist['raw_url']=mysql_escape_string($songlistArr['raw_url']);
$songlist['info']=mysql_escape_string($songlistArr['info']);
$sql="insert into songlist set".
"title=''".$songlist['title']."'".
",creat_time=''".$songlist['create_time']."'".
",creator=''".$songlist['creator']."'".
",raw_url=''".$songlist['raw_url']."'".
",info=''".$songlist['info']."';";
mysql_query($sql,$conn);
$songlist_id=mysql_insert_id();
foreach($songlistArr['songs'] as $song){
$title=mysql_escape_string($song['title']);
$sql="insert into song set title='".$title."'" .",songlist_id=".$songlist_id.";";
mysql_query($sql);
}
mysql_close($conn);
}
}
// Now, create a instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
//创建一个爬虫
$crawler = new MyCrawler();
//设置一个开始的连接
// URL to crawl
$start_url="www.kugou.com/yy/special/index/1-0-2.html";
$crawler->setURL($start_url);
//设置内容的类型
// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");
//忽略图片,设置那些连接不需要下载
//每一个精选集的连接
$crawler->addURLFollowRule("#http://www\.kugou\.com/yy/special/single/\d+\.html# i");//i 忽略大小写
//精选集页面的链接 下一页
$crawler->addURLFollowRule("#http://www\.kugou\.com/yy/special/index/\d+-0-2.html# i");
// Ignore links to pictures, dont even request pictures
$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to 1 MB (in bytes,
// for testing we dont want to "suck" the whole site)
//数据内容的容量,多少m,0是无限的
$crawler->setTrafficLimit(1000 * 1024);
// Thats enough, now here we go
$crawler->go();
// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();
if (PHP_SAPI == "cli") $lb = "\n";
else $lb = "<br />";
echo "Summary:".$lb;
echo "Links followed: ".$report->links_followed.$lb;
echo "Documents received: ".$report->files_received.$lb;
echo "Bytes received: ".$report->bytes_received." bytes".$lb;
echo "Process runtime: ".$report->process_runtime." sec".$lb;
?>
本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)