抓取百度关键词排名、标题、链接、描述
转载请标明出处
最近在做百度关键词排名的功能,发现网上相关资源比较少,于是自己琢磨了一下,写一下笔记。
本文重点在于提供思路,请不要过分依赖,本文主要靠抓取页面标签来完成,如果百度官网将页面标签修改了,请自行修改,如果遇到问题或需要修改的地方请私信我。
鸣谢:本公司SEO提供思路
package cc.test.core;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasParentFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Scrapes a Baidu result page for a keyword, prints the resolved link, title and
 * abstract of every result it recognises, and reports the position of the first
 * result that belongs to a given site.
 *
 * <p>Results are located purely by page markup: direct child {@code DIV}s of the
 * element with id {@code content_left}, and inside each one the elements whose
 * {@code class} attribute is exactly {@code c-showurl}, {@code t} and
 * {@code c-abstract}. If the page markup changes, these selectors must be updated.
 */
public class KeywordRun {

    /** Search endpoint; the URL-encoded keyword is appended as the {@code wd} parameter. */
    private static final String BAIDU_SEARCH_URL = "http://www.baidu.com/s?wd=";

    /**
     * Looks up the position of {@code url} in the result page for {@code keyword}.
     *
     * @param keyword search keyword; it is URL-encoded as UTF-8 before being sent
     * @param url     site to look for, e.g. {@code www.example.com}
     * @return 1-based position of the first matching result, or -1 when the keyword
     *         is {@code null}, nothing matches, or the page could not be processed
     */
    public int getKeywordRank(String keyword, String url) {
        if (keyword == null) {
            return -1;
        }
        try {
            // Non-ASCII keywords, spaces and '&' would otherwise corrupt the query string.
            return getThisRank(BAIDU_SEARCH_URL + URLEncoder.encode(keyword, "UTF-8"), url);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return -1;
        }
    }

    /**
     * Parses the result page at {@code resource}, prints each recognised result and
     * returns the position of the first one whose resolved link contains {@code url}.
     *
     * <p>A block only counts as a result (and only advances the position counter) when
     * it contains a link element with class {@code c-showurl}; other blocks are skipped
     * instead of aborting the scan. The scan always continues to the end of the page so
     * that every result is printed.
     *
     * @param resource URL of the result page (passed straight to the HTML parser)
     * @param url      site to look for; an optional scheme prefix is ignored
     * @return 1-based position of the first matching result, or -1 if there is none or
     *         an exception occurred while processing the page
     */
    public int getThisRank(String resource, String url) {
        int re = -1;
        int n = 1;
        try {
            Parser myParser = new Parser(resource);
            myParser.setEncoding("UTF-8");
            NodeFilter filter = new AndFilter(
                    new TagNameFilter("DIV"),
                    new HasParentFilter(new AndFilter(
                            new TagNameFilter("DIV"),
                            new HasAttributeFilter("id", "content_left"))));
            NodeList nodeList = myParser.extractAllNodesThatMatch(filter);
            for (int i = 0; i < nodeList.size(); i++) {
                String html = nodeList.elementAt(i).toHtml();

                Node showUrl = firstByClass(html, "c-showurl");
                if (!(showUrl instanceof LinkTag)) {
                    // No display-URL anchor: not a block we know how to read.
                    continue;
                }
                String link = HttpUtil.getBaiduFinalLink(((LinkTag) showUrl).getLink());
                System.out.println(link);

                String title = titleOf(firstByClass(html, "t"));
                if (title != null) {
                    System.out.println(title);
                }

                Node summary = firstByClass(html, "c-abstract");
                if (summary != null) {
                    String summaryHtml = summary.toHtml();
                    String _abstract = splitAndFilterString(summaryHtml, summaryHtml.length());
                    System.out.println(_abstract);
                }

                // Remember only the first hit, but keep going so every result is printed.
                if (re == -1 && matchesSite(link, url)) {
                    re = n;
                }
                n++;
            }
        } catch (Exception e) {
            e.printStackTrace();
            re = -1;
        }
        return re;
    }

    /**
     * Returns the first node in the HTML fragment whose {@code class} attribute equals
     * {@code cssClass}, or {@code null} when there is none.
     */
    private static Node firstByClass(String html, String cssClass) throws ParserException {
        Parser parser = new Parser(html);
        NodeList matches = parser.extractAllNodesThatMatch(new HasAttributeFilter("class", cssClass));
        return matches.size() > 0 ? matches.elementAt(0) : null;
    }

    /**
     * Returns the text of the first link that is a direct child of {@code heading},
     * or {@code null} when the heading is missing or holds no link.
     */
    private static String titleOf(Node heading) {
        if (heading == null || heading.getChildren() == null) {
            return null;
        }
        NodeList children = heading.getChildren();
        for (int i = 0; i < children.size(); i++) {
            // The first child may be whitespace text, so search instead of casting blindly.
            if (children.elementAt(i) instanceof LinkTag) {
                return ((LinkTag) children.elementAt(i)).getLinkText();
            }
        }
        return null;
    }

    /**
     * Tells whether a resolved result link belongs to the requested site: the link must
     * contain {@code url} once any leading {@code scheme://} has been removed from it.
     */
    private static boolean matchesSite(String link, String url) {
        if (link == null || url == null) {
            return false;
        }
        String target = url.trim();
        int scheme = target.indexOf("://");
        if (scheme >= 0) {
            target = target.substring(scheme + 3);
        }
        return target.length() > 0 && link.contains(target);
    }

    /** Demo entry point: prints the results and the rank found for a sample keyword and site. */
    public static void main(String[] args) {
        KeywordRun run = new KeywordRun();
        int re = run.getKeywordRank("百度", "www.baidu.com");
        System.out.println(re);
    }
}
需要用到的外部方法
/**
 * Resolves a Baidu redirect link to the address it finally lands on.
 *
 * <p>Issues a GET request for {@code link} and returns the host and path of the URL the
 * connection ends up at. The query string is not included.
 *
 * @param link the link to resolve
 * @return host followed by path of the final URL, or {@code null} when the link is
 *         malformed, is not an HTTP(S) URL, the request fails, or the server answers
 *         with a status of 400 or above
 */
public static String getBaiduFinalLink(String link){
    HttpURLConnection connection = null;
    try {
        URL realUrl = new URL(link);
        connection = (HttpURLConnection) realUrl.openConnection();
        connection.setRequestProperty("accept", "*/*");
        connection.setRequestProperty("connection", "Keep-Alive");
        connection.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
        connection.connect();
        // Ask for the status first: getInputStream() throws on error statuses, which
        // made a status check placed after it unreachable.
        if (connection.getResponseCode() >= 400) {
            return null;
        }
        URL host = connection.getURL();
        return host.getHost() + host.getPath();
    } catch (Exception e) {
        System.out.println("发送GET请求出现异常!" + e);
        e.printStackTrace();
        return null;
    } finally {
        // Release the connection on every path, including early returns and failures.
        if (connection != null) {
            connection.disconnect();
        }
    }
}
/**
 * Strips HTML markup from a string and optionally truncates the result.
 *
 * <p>Named character entities of the form {@code &name;} (1 to 10 letters) are removed,
 * then every {@code <...>} tag, then any remaining {@code (}, {@code )}, {@code /},
 * {@code <} or {@code >} character.
 *
 * @param input  text that may contain HTML; may be {@code null}
 * @param length maximum number of characters to keep from the cleaned text
 * @return an empty string when {@code input} is {@code null} or only whitespace;
 *         otherwise the cleaned text, cut to {@code length} characters and followed by
 *         {@code "......"} when it was longer than {@code length}
 */
public static String splitAndFilterString(String input, int length) {
    final boolean nothingToClean = input == null || input.trim().equals("");
    if (nothingToClean) {
        return "";
    }

    String text = input.replaceAll("\\&[a-zA-Z]{1,10};", "");
    text = text.replaceAll("<[^>]*>", "");
    text = text.replaceAll("[(/>)<]", "");

    return text.length() > length
            ? text.substring(0, length) + "......"
            : text;
}
到此结束。
至此已经获取到百度第一页的所有链接、标题和简介;如果百度页面有修改,请做相应的修改即可。
转载请标明出处
本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)