完整代码如下(可直接复制):
import os
import sqlite3
import threading
import time
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
class Database:
    """Thin wrapper around a sqlite3 connection holding scraped travel items."""

    def open(self, path="travels.db"):
        """Open (or create) the database.

        path: database file; defaults to the original hard-coded name so
        existing callers are unaffected (pass ":memory:" for testing).
        """
        self.con = sqlite3.connect(path)
        self.cursor = self.con.cursor()

    def close(self):
        """Commit pending writes and close the connection."""
        self.con.commit()
        self.con.close()

    def initialize(self):
        """(Re)create the items table, discarding any previous contents."""
        # IF EXISTS replaces the original bare `except: pass`, which
        # swallowed *every* error just to ignore "no such table".
        self.cursor.execute("drop table if exists items")
        self.cursor.execute(
            "create table items(ID varchar(8) primary key, tDate varchar(16), "
            "tTitle varchar(1024), tContent text, tExt varchar(8))"
        )

    def insert(self, ID, tDate, tTitle, tContent, tExt):
        """Insert one item; database errors (e.g. duplicate ID) are printed,
        not raised, preserving the original best-effort behavior."""
        try:
            self.cursor.execute(
                "insert into items (ID,tDate,tTitle,tContent,tExt) values(?, ?, ?, ?, ?)",
                [ID, tDate, tTitle, tContent, tExt],
            )
        except sqlite3.Error as err:  # narrower than Exception: only DB errors
            print(err)

    def show(self):
        """Print every stored item (all five columns) and a total count."""
        self.cursor.execute(
            "select ID,tDate,tTitle,tContent,tExt from items order by ID"
        )
        rows = self.cursor.fetchall()
        for row in rows:
            for field in row:
                print(field)
            print()
        print("Total", len(rows), "items")
def downloadImage(ID, src, tExt):
    """Fetch the image at *src* and save it as download/<ID>.<tExt>.

    Runs in a worker thread; failures are printed rather than raised so one
    bad image cannot kill the crawl. Relies on the module-level ``headers``
    dict for the request User-Agent.
    """
    try:
        req = urllib.request.Request(src, headers=headers)
        resp = urllib.request.urlopen(req, timeout=20)
        data = resp.read()
        imgName = ID + "." + tExt
        # `with` guarantees the handle is closed (original leaked it on
        # write errors); os.path.join replaces the Windows-only "download\\".
        with open(os.path.join("download", imgName), "wb") as f:
            f.write(data)
        print("Downloaded " + imgName)
    except Exception as err:
        # Broad by design: best-effort download in a background thread.
        print(err)
def downloadContent(url):
    """Download the article page at *url* and return the text of every
    paragraph under ``div#Content``, one per line.

    Returns "" on any failure (network error, parse error), printing the
    error instead of raising. Uses the module-level ``headers`` dict.
    """
    content = ""
    try:
        req = urllib.request.Request(url, headers=headers)
        # timeout added for consistency with downloadImage, so one stalled
        # article page cannot hang the whole crawl indefinitely.
        resp = urllib.request.urlopen(req, timeout=20)
        html = resp.read().decode()
        soup = BeautifulSoup(html, "lxml")
        ps = soup.select("div[id='Content'] p")
        # join instead of repeated += (quadratic in the worst case)
        content = "".join(p.text + "\n" for p in ps)
    except Exception as err:
        # Best-effort: a missing/broken article yields an empty string.
        print(err)
    return content
def initializeDownload():
    """Ensure an empty ``download`` directory exists, deleting any leftover
    files from a previous crawl."""
    if not os.path.exists("download"):
        os.mkdir("download")
    for name in os.listdir("download"):
        # os.path.join replaces the Windows-only "download\\" concatenation,
        # making the script portable.
        os.remove(os.path.join("download", name))
def spider(url):
    """Crawl one listing page: store each item in the DB, spawn an image
    download thread per item, then recurse into the "Next" page link.

    Reads/updates the module globals ``page``, ``count``, ``DB``,
    ``threads`` and ``headers``. All errors are printed, not raised.
    """
    global page, count, DB, threads
    page = page + 1
    print("Page", page, url)
    try:
        req = urllib.request.Request(url, headers=headers)
        # timeout added for consistency with downloadImage
        resp = urllib.request.urlopen(req, timeout=20)
        html = resp.read().decode()
        soup = BeautifulSoup(html, "lxml")
        divs = soup.select("div[class='lft_art lf'] div[class='mb10 tw3_01_2']")
        for div in divs:
            tTitle = div.select_one("span h4").text
            tDate = div.select_one("span b").text
            count = count + 1
            ID = "%06d" % count
            img = div.select_one("span a img")
            src = ""
            tExt = ""
            if img:
                # urljoin lives in urllib.parse; urllib.request.urljoin is an
                # undocumented internal re-export and should not be relied on.
                src = urllib.parse.urljoin(url, img["src"])
                p = src.rfind(".")
                if p >= 0:
                    tExt = src[p + 1:]
                # only spawn a download thread when an image actually exists
                T = threading.Thread(target=downloadImage, args=[ID, src, tExt])
                T.start()
                threads.append(T)
            link = div.select_one("span h4 a")["href"]
            link = urllib.parse.urljoin(url, link)
            tContent = downloadContent(link)
            DB.insert(ID, tDate, tTitle, tContent, tExt)
        # Look for the paginator's "Next" link and follow it.
        nextUrl = ""
        links = soup.select("div[id='div_currpage'] a[class='pagestyle']")
        for link in links:
            if link.text == "Next":
                href = link["href"]
                if href.startswith("//www."):
                    # protocol-relative URL: the site serves http
                    nextUrl = "http:" + href
                else:
                    nextUrl = urllib.parse.urljoin(url, href)
                break
        if nextUrl:
            spider(nextUrl)
    except Exception as err:
        print(err)
# Module-level config: browser-like User-Agent so the site serves normal pages.
headers = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0)AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 32.0.1664.3Safari / 537.36"}

# Simple interactive menu: crawl the site, display stored items, or quit.
while True:
    print("1.Spider")
    print("2.Show")
    print("3.Exit")
    s = input("Please enter(1,2,3):")
    if s == "1":
        initializeDownload()
        threads = []
        page = 0
        count = 0
        DB = Database()
        DB.open()
        DB.initialize()
        spider(url="http://www.chinadaily.com.cn/travel/citytours")
        DB.close()
        # Safe to close the DB first: image threads only write files.
        for T in threads:
            T.join()
        print("Total %d pages, %d items" % (page, count))
    elif s == "2":
        DB = Database()
        DB.open()
        try:
            DB.show()
        except sqlite3.OperationalError as err:
            # "Show" before any crawl: the items table does not exist yet.
            # The original crashed here with an uncaught exception and
            # leaked the connection.
            print(err)
        finally:
            DB.close()
    else:
        break
运行结果:
爬取到的图片: