sscanf 按照指定的格式从字符串中解析输入,功能与正则表达式类似。
<?php
// sscanf() parses a string according to a printf-like format and stores each
// converted value in the variables passed after the format string.
$str = "Hi, I'm 25 years old.";
sscanf($str, "Hi, I'm %d years old.", $age);
echo "Age: $age\n";

$str = "My name is John , and I'm a male.";
// %s reads up to the next whitespace, so a trailing "%s." would swallow the
// final period into $gender ("male."). %[^.] stops right before the period.
sscanf($str, "My name is %s , and I'm a %[^.].", $name, $gender);
echo "Name: $name, Gender: $gender\n";
?>
输出结果:
Age: 25
Name: John, Gender: male
正则三段论:定锚点,去噪点,取数据。不关心的部分就去掉,关心的部分用正则定锚点取出来
采集标题和链接
// Sample markup: a list whose <li> items each wrap one article link.
$string = <<<EOT
<ul class="textList textListBig">
<li><a href="/learn/article/21707">为宝宝记录成长每一刻</a></li>
<li><a href="/learn/article/21705">细数与宝宝树的情愫</a></li>
<li><a href="/learn/article/21693">备孕最忌讳的11件事情</a></li>
<li><a href="/learn/article/21682">经营幸福家庭的六大秘诀</a></li>
</ul>
EOT;
// Collected results: parallel lists of titles and absolute links.
// Initialised up front so both keys exist even when nothing matches.
$article = array('title' => array(), 'link' => array());
// Anchor on `<li><a href="/learn/article/`, then capture the article id and the link text.
// The negated class for the id and the lazy quantifier for the title keep every match
// inside its own <a> element, even when several items share one line.
preg_match_all('/<li><a href="\/learn\/article\/([^"]*)">(.*?)<\/a>/', $string, $out, PREG_SET_ORDER);
foreach ($out as $match) {
    // $match[1] is the article id, $match[2] the link text.
    $article['title'][] = $match[2];
    $article['link'][] = "http://www.babytree.com/learn/article/" . $match[1];
}
// Sample markup: one headline link wrapped in <div class="txt">.
$content = <<<EOT
<div class="txt">
<h2><a class="color_black" href="http://new.qq.com/omn/20180112A0EB7G.html" target="_blank">一台苹果iPhone到底能赚多少钱?是小米手机的80倍</a></h2>
</div>
EOT;
$data = array();
$data_cnt = 0;
$matches = array();
// Anchor on the wrapper div, lazily skip ahead to the first href, then take the link text.
// The `s` modifier lets `.` cross the line break between the <div> and the <h2>.
$pattern = '/<div class="txt">.*?href="(.*?)".*?>(.*?)<\/a>/s';
// preg_match() returns 1 only when the pattern matched; without this check a miss
// would read undefined offsets of $matches.
if (preg_match($pattern, $content, $matches) === 1) {
    $data[$data_cnt] = array(
        'url' => $matches[1],
        'intro' => $matches[2],
    );
    $data_cnt++;
}
取新闻列表时可以发现,每条新闻都是由 class 为“Q-tpListInner”的 div 包起来的,并且我们要取出的 url 在 a 标签的 href 中,要取的新闻标题在 a 标签的 title 中,这就是传说中的“定锚点、去噪点”的过程了。
// Sample markup: two news items, each wrapped in <div class="Q-tpListInner">.
$content=<<<EOT
<div class="Q-tpList">
<div class="Q-tpListInner">
<a target="_blank" href="http://tech.qq.com/a/20180112/023094.htm" class="pic"> <img class="zutu0" src="http://inews.gtimg.com/newsapp_ls/0/2690086283_300240/0"></a>
<div class="itemtxt itemtxt0">
<h3 class="f18 l26">
<a target="_blank" href="http://tech.qq.com/a/20180112/023094.htm" title="途牛宣布一亿美元股票回购计划及CTO任命">途牛宣布一亿美元股票回购计划及CTO任命</a>
</h3>
<div class="timelabel">
<span class="aTime">01月12日 16:38更新</span>
<span class="techTag" style="display:inline-block">标签:
<em><a class="columnlist" title="途牛" href="http://tech.qq.com/clear_article_qq/tag_article_list.htm?tags=%E9%80%94%E7%89%9B" target="_blank">途牛</a><a class="columnlist" title="回购" href="http://tech.qq.com/clear_article_qq/tag_article_list.htm?tags=%E5%9B%9E%E8%B4%AD" target="_blank">回购</a></em>
</span>
</div>
<div class="newsinfo cf">
<div class="operate" style="">
<div class="chupin">腾讯科技</div>
<div class="shareTo" style="top:0;">
<div class="shareBtn" onmouseover="shareshow(this)" onmouseout="sharehide(this)">
<span class="shareshowbtn"></span>
<div class="share" style="display: none;" bosszone="kjsy_share">
<a onclick="postToWb(this.name,this.href,this.id); return false;" title="分享到微博" class="sharewb" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命" href="http://inews.gtimg.com/newsapp_ls/0/2690086283_150120/0">分享到微博</a>
<a onclick="postToQzone(this.name,'',this.href,this.id); return false;" title="分享到QQ空间" class="shareqzone" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命" href="http://inews.gtimg.com/newsapp_ls/0/2690086283_150120/0">分享到空间</a>
<a href="javascript:void(0)" onclick="shareToSina(this.name,this.id); return false;" title="分享到新浪微博" class="sharesina" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命">分享到新浪微博</a>
<a onclick="postToQQEmail(this.name,'',this.id,this.href); return false;" title="分享到QQ邮箱" class="shareqqemail" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命" href="http://inews.gtimg.com/newsapp_ls/0/2690086283_150120/0">分享到QQ邮箱</a>
<a onclick="shareToQQ(this.name,this.href,this.id); return false;" title="分享到QQ好友" class="sharepengyou" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命" href="http://inews.gtimg.com/newsapp_ls/0/2690086283_150120/0">分享到QQ好友</a>
<a href="javascript:void(0)" onclick="shareToRenren(this.name,this.id); return false;" title="分享到人人" class="sharerenren" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命">分享到人人</a>
<a href="javascript:void(0)" onclick="shareToKaixin(this.name,this.id); return false;" title="分享到开心" class="sharekaixin" id="http://tech.qq.com/a/20180112/023094.htm" name="途牛宣布一亿美元股票回购计划及CTO任命">分享到开心</a>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
<div class="Q-tpList">
<div class="Q-tpListInner">
<a target="_blank" href="http://new.qq.com/omn/20180112A0CNKT.html" class="pic"> <img class="zutu0" src="http://inews.gtimg.com/newsapp_ls/0/2688353285_300240/0"></a>
<div class="itemtxt itemtxt0">
<h3 class="f18 l26">
<a target="_blank" href="http://new.qq.com/omn/20180112A0CNKT.html" title="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步">王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步</a>
</h3>
<div class="timelabel">
<span class="aTime">01月12日 13:17更新</span>
<span class="techTag" style="display:inline-block">标签:
<em><a class="columnlist" title="周鸿祎" href="http://tech.qq.com/clear_article_qq/tag_article_list.htm?tags=%E5%91%A8%E9%B8%BF%E7%A5%8E" target="_blank">周鸿祎</a><a class="columnlist" title="王思聪" href="http://tech.qq.com/clear_article_qq/tag_article_list.htm?tags=%E7%8E%8B%E6%80%9D%E8%81%AA" target="_blank">王思聪</a><a class="columnlist" title="美团" href="http://tech.qq.com/clear_article_qq/tag_article_list.htm?tags=%E7%BE%8E%E5%9B%A2" target="_blank">美团</a></em>
</span>
</div>
<div class="newsinfo cf">
<div class="operate" style="">
<div class="chupin">IT桔子</div>
<div class="shareTo" style="top:0;">
<div class="shareBtn" onmouseover="shareshow(this)" onmouseout="sharehide(this)">
<span class="shareshowbtn"></span>
<div class="share" style="display: none;" bosszone="kjsy_share">
<a onclick="postToWb(this.name,this.href,this.id); return false;" title="分享到微博" class="sharewb" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步" href="http://inews.gtimg.com/newsapp_ls/0/2688353285_150120/0">分享到微博</a>
<a onclick="postToQzone(this.name,'',this.href,this.id); return false;" title="分享到QQ空间" class="shareqzone" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步" href="http://inews.gtimg.com/newsapp_ls/0/2688353285_150120/0">分享到空间</a>
<a href="javascript:void(0)" onclick="shareToSina(this.name,this.id); return false;" title="分享到新浪微博" class="sharesina" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步">分享到新浪微博</a>
<a onclick="postToQQEmail(this.name,'',this.id,this.href); return false;" title="分享到QQ邮箱" class="shareqqemail" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步" href="http://inews.gtimg.com/newsapp_ls/0/2688353285_150120/0">分享到QQ邮箱</a>
<a onclick="shareToQQ(this.name,this.href,this.id); return false;" title="分享到QQ好友" class="sharepengyou" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步" href="http://inews.gtimg.com/newsapp_ls/0/2688353285_150120/0">分享到QQ好友</a>
<a href="javascript:void(0)" onclick="shareToRenren(this.name,this.id); return false;" title="分享到人人" class="sharerenren" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步">分享到人人</a>
<a href="javascript:void(0)" onclick="shareToKaixin(this.name,this.id); return false;" title="分享到开心" class="sharekaixin" id="http://new.qq.com/omn/20180112A0CNKT.html" name="王思聪、周鸿祎疯狂撒币的背后,是问答模式商业化的第一步">分享到开心</a>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
EOT;
// Start from an empty result list: $data_cnt restarts at 0, so entries left over
// from an earlier extraction must not linger behind the new ones.
$data = array();
$data_cnt = 0;
$matches = array();
// Anchor on the item wrapper, then take href and title from the SAME <a> tag.
// [^>]* cannot leave a tag, so the picture link (which has no title attribute) is
// skipped and the headline link inside <h3> is the first one that qualifies.
// The `s` modifier lets the lazy `.*?` cross line breaks inside one item.
$pattern = '/Q-tpListInner.*?<a\b[^>]*\bhref="([^"]*)"[^>]*\stitle="([^"]*)"[^>]*>/s';
preg_match_all($pattern, $content, $matches);
// $matches[1] holds every url, $matches[2] the title at the same index.
foreach ($matches[1] as $i => $url) {
    $data[$data_cnt] = array(
        'url' => $url,
        'intro' => $matches[2][$i],
    );
    $data_cnt++;
}
分页采集
// Sample markup of one listing page: a <ul class="corp_info"> holding one company entry.
$pageCode_source = <<<EOT
<ul class="corp_info">
<li class="h_com_list clearfix">
<div class="h_com_info">
<h3><a href="http://ccmhw.qipei8.com" target="_blank">长春马宏伟汽车用品销售有限公司</a></h3>
<div class="h_introduce clearfix">
<ol class="h_product">
<li>
<span><img src="http://img.qipei8.com/fen.gif" title="汽配指数" align="absmiddle"></span> <font color="#ff6600">36</font>
</li>
<li>
<span>电话:</span>86-0431-1335154-2227
</li>
<li>地址:长春市 绿园区锦程大街355号景程苑1-3号金东方汽车用品采购基地2-10</li>
</ol>
<i></i>
<ol class="h_com_time">
<li> 吉林 长春</li>
</ol>
<div class="h_com_btn"><a href="http://ccmhw.qipei8.com/contact.html" target="_blank" class="h_contact">查看联系方式</a><a href="http://ccmhw.qipei8.com/product.html" target="_blank" class="h_pro_cen">进入产品中心</a></div>
</div>
</div>
<ul class="h_product_pic">
<li class="h_product_pic_l">经销商</li>
<li class="h_product_pic_r">
<div>
<a href="http://ccmhw.qipei8.com/product.html" target="_blank">
查看更多产品>>
</a>
</div>
</li>
</ul>
</li>
</ul>
EOT;
// Does the current page contain any company data?
// [\s\S] already matches every character (including newlines), so no further
// ranges are needed inside the class.
$rege_for_gongsi = '/<ul[\s]+class="corp_info">[\s\S]+<\/ul>/i';
preg_match_all($rege_for_gongsi, $pageCode_source, $rege_for_gongsi_ms);
// `if`, not `while`: nothing below changes $rege_for_gongsi_ms, so a loop on this
// condition would never terminate once the page matches. A real paginated crawl
// would fetch the next page into $pageCode_source and re-run the match on each pass.
if (!empty(current($rege_for_gongsi_ms))) {
    // Parse the contact-page data of every company on this page.
    getPageData($pageCode_source);
}
// Sample markup of a contact-details table (class "tab-item"): one <th> label /
// <td> value pair per row. It is not referenced by the code visible in this
// excerpt; presumably it illustrates the table that getPageData() extracts from
// each contact page — confirm against the real page source.
$table = <<<EOT
<table class="tab-item" width="656" cellspacing="0" cellpadding="0" align="center">
<tbody>
<tr>
<th width="119">公司名称</th>
<td width="">长春马宏伟汽车用品销售有限公司</td>
</tr>
<tr>
<th>联系人</th>
<td>王兴莲</td>
</tr>
<tr>
<th>职位</th>
<td>经理</td>
</tr>
<tr>
<th>电话</th>
<td>86-0431-1335154-2227</td>
</tr>
<tr>
</tr>
<tr>
<th>手机</th>
<td>13351542227</td>
</tr>
<tr>
<th>邮箱</th>
<td>ccmhw@qipei8.com</td>
</tr>
<tr>
<th>地址</th>
<td>长春市 绿园区锦程大街355号景程苑1-3号金东方汽车用品采购基地2-10</td>
</tr>
<tr>
<th>公司主页</th>
<td>ccmhw.qipei8.com</td>
</tr>
</tbody>
</table>
EOT;
/**
 * Collects the contact details of every company listed on one listing page.
 *
 * For each link with class "h_contact" found in $pageCode_source, the linked
 * contact page is downloaded with cURL, its "tab-item" table is extracted, and
 * the labelled rows (name, contact person, position, phone, fax, mobile, e-mail,
 * address, postcode) are read out of that table.
 *
 * @param string $pageCode_source HTML of one listing page.
 * @return array One associative array per company whose contact table could be
 *               fetched, keyed by the position of its contact link on the page.
 *               Fields that are missing from a table are empty strings.
 *               Persisting the result is still open (see the todo below).
 */
function getPageData($pageCode_source) {
    // Contact-page links. [^"]+ keeps the URL capture inside its own href attribute.
    $rege_for_lianxifangshi = '/<a[\s]+href="([^"]+)"[\s]+target="_blank"[\s]+class="h_contact">/i';
    preg_match_all($rege_for_lianxifangshi, $pageCode_source, $contact_links);

    // Company types. The whitespace before ">" is optional: requiring it would miss
    // markup such as <li class="h_product_pic_l">…</li>.
    $rege_for_company_type = '/"h_product_pic_l"[\s]*>(.+?)<\/li>/i';
    preg_match_all($rege_for_company_type, $pageCode_source, $type_matches);
    $company_types = $type_matches[1];

    // Download every contact page and keep the inner HTML of its contact table.
    // Entries are stored under the link's own index so they stay aligned with
    // $company_types even when a page has to be skipped.
    $contact_tables = array();
    foreach ($contact_links[1] as $key => $url) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        // Return the body instead of printing it.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // Leave the HTTP response headers out of the returned body.
        curl_setopt($ch, CURLOPT_HEADER, 0);
        $contact_html = curl_exec($ch);
        curl_close($ch);

        // curl_exec() returns false when the transfer fails; skip that company.
        if ($contact_html === false) {
            continue;
        }

        // Match the table by its class only, so the order of the other attributes
        // does not matter; the lazy body stops at the first closing </table>.
        $rege_for_detail = '/<table\b[^>]*\bclass="tab-item"[^>]*>([\s\S]+?)<\/table>/i';
        if (preg_match($rege_for_detail, $contact_html, $table_match) === 1) {
            $contact_tables[$key] = $table_match[1];
        }
    }

    // Result field => <th> label that precedes the value cell.
    $field_labels = array(
        'company_name' => '公司名称',     // company name
        'company_contactman' => '联系人', // contact person
        'company_job' => '职位',          // position
        'company_phone' => '电话',        // phone
        'company_fax' => '传真',          // fax
        'company_mobile' => '手机',       // mobile
        'company_email' => '邮箱',        // e-mail
        'company_address' => '地址',      // address
        'company_postcode' => '邮编',     // postcode
    );

    // All company records, keyed like $contact_tables.
    $all_company_msg = array();
    foreach ($contact_tables as $key => $table_html) {
        $company = array(
            'company_type' => isset($company_types[$key]) ? $company_types[$key] : '',
        );
        foreach ($field_labels as $field => $label) {
            // "<label></th> <td ...>value</td>": lazy capture of the value cell.
            $rege = '/' . preg_quote($label, '/') . '<\/th>\s*<td[^>]*>(.*?)<\/td>/s';
            $company[$field] = preg_match($rege, $table_html, $cell) === 1 ? trim($cell[1]) : '';
        }
        $all_company_msg[$key] = $company;
    }
    //todo insert db
    return $all_company_msg;
}
获取table中的td数据
// Sample markup: a wrapper div around a table of label/value cells; the last cell
// also carries an <a> link that is not part of the data.
$div = <<<EOR
<div class="de_170822_d01_d">
<table>
<tbody>
<tr>
<td>
<span>公司中文名: </span>
</td>
<td>
<span>中兵通信科技股份有限公司</span>
</td>
<td>
<span>注册资本: </span>
</td>
<td>
<span>192150000元</span>
</td>
</tr>
<tr>
<td>
<span>注册地址: </span>
</td>
<td>
<span>河南省新乡市工业园区纬七路760号</span>
</td>
<td>
<span>法人代表: </span>
</td>
<td>
<span>浮德海</span>
</td>
</tr>
<tr>
<td>
<span>成立时间: </span>
</td>
<td>
<span>1997-12-03</span>
</td>
<td>
<span>官方联系方式: </span>
</td>
<td>
<span>0373-6358301</span>
<a href="javascript:;" class="de_170822_d01_d_a01">联系创始人</a>
</td>
</tr>
</tbody>
</table>
</div>
EOR;
// Drop complete <a>…</a> elements so only the cell text remains.
// \b limits the match to the <a> tag itself; without it, any tag whose name merely
// starts with "a" (<abbr>, <article>, <address>, …) would be removed as well.
$html = preg_replace('/<a\b[^>]*>.*?<\/a\s*>/si', "", $div);
// Remove any unpaired <a …> or </a> tags that are left over.
$html = preg_replace('/<\/?a\b[^>]*>/si', "", $html);
// get_tag_data() and get_td_array() are defined outside this excerpt. Presumably the
// first cuts out the content between the given start and end markers and the second
// splits the table into an array of cell values — confirm against their definitions.
$html = get_tag_data($html, '<div class="de_170822_d01_d">', '</div>');
$html = get_td_array($html);