jsoup jar包 1.11.2
链接: https://pan.baidu.com/s/1pe3-r5_YB-pGEsosfRLbsA?pwd=41w5
提取码: 41w5
效果:
代码:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
/**
 * Crawls the image site rooted at {@code parentUrl}.
 *
 * <p>Steps: read the category links from the index page, walk the first
 * {@code pagesPerCategory} listing pages of every category, open each detail page
 * linked from a listing, and hand the main picture to
 * {@link #download(String, String, String)}, which stores it in the category's
 * folder below {@code E:\worm}.
 *
 * <p>A failure on a single listing page or picture is reported on stderr and skipped,
 * so one broken link does not stop the whole crawl. Only a failure to load the index
 * page itself ends the run.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String parentUrl = "XXXX";
    final int timeoutMillis = 10000;
    // Number of listing pages fetched per category.
    final int pagesPerCategory = 10;
    try {
        URL indexUrl = new URL("XXXX");
        Document indexHtml = Jsoup.parse(indexUrl, timeoutMillis);
        // Category navigation nodes
        Elements categoryBlocks = indexHtml.getElementsByClass("classify clearfix");
        for (Element categoryBlock : categoryBlocks) {
            for (Element categoryLink : categoryBlock.select("a")) {
                String categoryHref = categoryLink.attr("href");
                String categoryName = categoryLink.attr("title");
                if (categoryHref.isEmpty()) {
                    // An anchor without a target cannot be crawled.
                    continue;
                }

                // Local folder that receives this category's pictures.
                File categoryDir = new File("E:\\worm\\" + categoryName);
                if (!categoryDir.isDirectory() && !categoryDir.mkdirs()) {
                    System.err.println("Cannot create directory " + categoryDir + ", skipping category");
                    continue;
                }

                for (int page = 1; page <= pagesPerCategory; page++) {
                    // The first page is index.html; later pages follow the index_<n>.html rule.
                    String pageFile = (page == 1) ? "index.html" : "index_" + page + ".html";
                    String listingUrl = parentUrl + categoryHref + pageFile;

                    Document listing;
                    try {
                        listing = Jsoup.parse(new URL(listingUrl), timeoutMillis);
                    } catch (Exception e) {
                        // Per-page isolation: a missing or unreachable page must not end the crawl.
                        System.err.println("Skipping listing page " + listingUrl + ": " + e);
                        continue;
                    }

                    for (Element listBlock : listing.getElementsByClass("clearfix")) {
                        for (Element item : listBlock.getElementsByTag("li")) {
                            String detailHref = item.select("a").attr("href");
                            if (detailHref.isEmpty()) {
                                continue;
                            }
                            try {
                                Document detail = Jsoup.parse(new URL(parentUrl + detailHref), timeoutMillis);
                                // first() returns null when the page has no "photo-pic" element.
                                Element photo = detail.getElementsByClass("photo-pic").first();
                                if (photo == null) {
                                    continue;
                                }
                                Elements picture = photo.select("img");
                                String imgSrc = picture.attr("src");
                                if (imgSrc.isEmpty()) {
                                    continue;
                                }
                                // File-name filter: drop whitespace and every character
                                // Windows forbids in a file name (\ / : * ? " < > |).
                                String title = picture.attr("title").replaceAll("[\\\\/:*?\"<>|\\s]", "");
                                download(parentUrl + imgSrc, categoryName, title);
                            } catch (Exception e) {
                                // Per-picture isolation: report and move on to the next item.
                                System.err.println("Skipping " + parentUrl + detailHref + ": " + e);
                            }
                        }
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Downloads the resource at {@code urlString} and stores it as
 * {@code <imgName>.jpg} inside the {@code dirsName} folder below {@code E:\worm}.
 *
 * <p>An existing file with the same name is overwritten. Appending to it, as an
 * append-mode stream would, concatenates two images into one corrupt file.
 * Both streams are closed even when the transfer fails part-way.
 *
 * @param urlString absolute URL of the image to fetch
 * @param dirsName  name of the sub-folder (category) the image is stored in
 * @param imgName   file name without extension
 * @throws Exception if the URL is malformed, the connection fails or times out,
 *                   or the target file cannot be written
 */
public static void download(String urlString, String dirsName, String imgName) throws Exception {
    // Build the URL
    URL url = new URL(urlString);
    // Open the connection; without timeouts a stalled server would block forever
    URLConnection con = url.openConnection();
    con.setConnectTimeout(10000);
    con.setReadTimeout(10000);

    // Download path and image file name
    String filename = "E:\\worm\\" + dirsName + "\\" + imgName + ".jpg";
    File file = new File(filename);
    File parentDir = file.getParentFile();
    if (parentDir != null) {
        // Result deliberately not checked: if the directory is still missing,
        // opening the output stream below fails with a descriptive exception.
        parentDir.mkdirs();
    }

    // try-with-resources closes both streams on success and on failure
    try (InputStream is = con.getInputStream();
         FileOutputStream os = new FileOutputStream(file)) {
        // 1K data buffer
        byte[] bs = new byte[1024];
        // Number of bytes read in the current pass
        int len;
        while ((len = is.read(bs)) != -1) {
            os.write(bs, 0, len);
        }
    }
}