爬取3499手游网下载地址信息
爬取游戏的下载地址和信息,爬取的信息存入到数据库中。
1、首先需要安装第三方库
requests、lxml、MySQLdb(Python 3 下 MySQLdb 模块由 mysqlclient 包提供,可执行 pip install requests lxml mysqlclient 安装)
2、先创建down_software数据库,创建youxi表
-- Table that receives one row per scraped game page (filled by mySql() in the script below).
create table down_software.youxi
(
-- Surrogate key generated by the database.
id int auto_increment
primary key,
-- Game title; check_url() looks rows up by this column to skip games already stored.
yx_title varchar(255) null,
-- Comma-joined texts scraped from the first info paragraph of the detail page.
yx_bsc varchar(255) null,
-- Comma-joined texts scraped from the first span of the second info paragraph.
yx_os varchar(255) null,
-- Single text scraped from the second span of the second info paragraph.
yx_type varchar(255) null,
-- Description paragraphs joined with newlines.
comment_str text null,
-- Comma-joined screenshot image URLs.
yx_jietu text null,
-- Comma-joined download links.
-- NOTE(review): several joined links may exceed 255 characters — confirm this width is enough.
yx_down_str varchar(255) null,
-- Icon image URL.
yx_head_portrait_url varchar(255) null,
-- Link of the scraped page, as passed in the last argument of mySql().
url varchar(255) null
)
-- NOTE(review): MyISAM is a non-transactional engine, so the db.rollback() call in mySql()
-- cannot undo anything on this table — confirm whether InnoDB was intended.
engine = MyISAM;
3、要爬取的网站https://www.34347.com,直接贴代码。
import requests
from lxml import etree
import MySQLdb
import time
def mySql(yx_title, yx_bsc, yx_os, yx_type, comment_str, yx_jietu, yx_down_str, yx_head_portrait_url, url):
    """Insert one scraped game record into the ``youxi`` table.

    Each argument is stored in the column of the same name. The values are
    sent as query parameters, so the driver escapes them: scraped text that
    contains quotes can no longer break (or inject into) the statement, and
    the values are stored exactly as given, without the stray leading space
    that the previous string-concatenated statement put in front of most of
    them.

    Database errors are reported on stdout and otherwise swallowed so that a
    single bad record does not stop the crawl. Returns ``None``.
    """
    sql = (
        "INSERT INTO youxi(yx_title, yx_bsc, yx_os, yx_type, comment_str, "
        "yx_jietu, yx_down_str, yx_head_portrait_url, url) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )
    params = (
        yx_title,
        yx_bsc,
        yx_os,
        yx_type,
        comment_str,
        yx_jietu,
        yx_down_str,
        yx_head_portrait_url,
        url,
    )
    db = MySQLdb.connect("localhost", "root", "root", "down_software", charset='utf8')
    try:
        cursor = db.cursor()
        try:
            cursor.execute(sql, params)
            db.commit()
        except MySQLdb.Error as exc:
            # Keep the crawl best-effort, but say which record was lost and why.
            db.rollback()
            print('insert failed for %r: %s' % (yx_title, exc))
    finally:
        # Always release the connection, even if cursor() or rollback() raised.
        db.close()
def down_info(url_i):
    """Scrape every game detail page in *url_i* and store new games.

    *url_i* is an iterable of site-relative links (as found on a list page).
    For each link the detail page is downloaded and its title, info fields,
    description, screenshots, download links and icon are extracted with
    XPath, then handed to ``mySql``.

    A page is skipped (and the loop moves on to the next link) when it has no
    title or when ``check_url`` reports that the title is already stored.
    Missing icon or type nodes are stored as empty strings instead of raising
    ``IndexError``.
    """
    for href in url_i:
        url = 'https://www.34347.com' + href
        h1 = requests.get(url, data=data, headers=headers)
        print(url)
        html_info = etree.HTML(h1.text)

        titles = html_info.xpath('/html/body/div[1]/div[4]/div[3]/div[1]/div/div[1]/dl/dt/h3/text()')
        if not titles:
            # Not a regular detail page; keep going with the remaining links.
            continue
        yx_title = str(titles[0])
        if check_url(yx_title):
            # Already in the database; the following links may still be new.
            continue

        portraits = html_info.xpath('/html/body/div[1]/div[4]/div[3]/div[1]/div/div[1]/div[1]/img/@src')
        yx_head_portrait_url = str(portraits[0]) if portraits else ''
        yx_down = html_info.xpath('/html/body/div[1]/div[4]/div[3]/div[1]/div/div[1]/dl/dd[3]/a[.]/@href')
        yx_bsc = html_info.xpath(
            '/html/body/div[1]/div[4]/div[3]/div[1]/div/div[1]/dl/dd[2]/p[1]/span[.]/em/text()')
        yx_os = html_info.xpath(
            '/html/body/div[1]/div[4]/div[3]/div[1]/div/div[1]/dl/dd[2]/p[2]/span[1]/em[.]/text()')
        types = html_info.xpath('/html/body/div[1]/div[4]/div[3]/div[1]/div/div[1]/dl/dd[2]/p[2]/span[2]/em/text()')
        yx_type = str(types[0]) if types else ''
        yx_comment = html_info.xpath(
            '/html/body/div[1]/div[4]/div[3]/div[1]/div/div[2]/div[2]/div[1]/div[1]/p[.]/text()')
        yx_jietu = html_info.xpath('//*[@id="x_img_viewer"]/div/ul/li[.]/img/@src')

        # Multi-valued fields are flattened to one string per column.
        yx_bsc_str = ','.join(yx_bsc)
        yx_os_str = ','.join(yx_os)
        comment_str = '\n'.join(yx_comment)
        yx_jietu_str = ','.join(yx_jietu)
        yx_down_str = ','.join(yx_down)

        # Pass the page link itself: the loop variable is no longer reused by
        # an inner loop, so it cannot be overwritten with description text.
        mySql(yx_title, yx_bsc_str, yx_os_str, yx_type, comment_str, yx_jietu_str, yx_down_str,
              yx_head_portrait_url, href)
def check_url( result_title):
    """Return the stored rows whose ``yx_title`` equals *result_title*.

    The result is whatever ``cursor.fetchall()`` yields: an empty sequence
    when the title is not in the ``youxi`` table yet, otherwise one
    single-column row per match. The title is sent as a query parameter, so
    titles containing quotes neither break the statement nor allow SQL
    injection.

    Raises:
        MySQLdb.Error: if connecting or querying fails.
    """
    # NOTE(review): this password differs from the one mySql() connects with —
    # confirm which credentials are correct.
    db = MySQLdb.connect("localhost", "root", "qinchaowei", "down_software", charset='utf8')
    try:
        cursor = db.cursor()
        cursor.execute("SELECT yx_title FROM youxi WHERE yx_title = %s", (result_title,))
        return cursor.fetchall()
    finally:
        # Release the connection even when the query raises.
        db.close()
def pageinfo(num):
    """Crawl list pages ``0`` through ``int(num)`` inclusive.

    For every list page the links of the listed games are extracted and
    passed to ``down_info``, which scrapes and stores each game.

    Args:
        num: index of the last list page; anything ``int()`` accepts.
    """
    last_page = int(num)
    for page in range(last_page + 1):
        page_url = "https://www.34347.com/game/list-" + str(page) + "-0-0-0-0-0-0-0.html"
        response = requests.get(page_url, data=data, headers=headers)
        tree = etree.HTML(response.text)
        # Only the links are needed here; titles are read from the detail pages.
        game_links = tree.xpath('/html/body/div[1]/div[4]/div[2]/div[2]/ul/li[.]/h3/a/@href')
        down_info(game_links)
# Request body and headers shared by every HTTP request the functions above make.
data = {'some': 'data'}
headers = {
    'content-type': 'application/json',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0',
}

# Fetch the first list page once to read the pager.
s = requests.get(
    'https://www.34347.com/game/list-0-0-0-0-0-0-0-0.html',
    data=data,
    headers=headers,
)
html = etree.HTML(s.text)
result_url = html.xpath('/html/body/div[1]/div[4]/div[2]/div[2]/ul/li[.]/h3/a/@href')
result_title = html.xpath('/html/body/div[1]/div[4]/div[2]/div[2]/ul/li[.]/h3/a/text()')
yx_page = html.xpath('/html/body/div[1]/div[4]/div[2]/div[3]/nav/ul/li[.]/a/text()')

# Discard the final pager entry (presumably a "next page" link — verify against
# the live markup); the entry before it is used as the last page number.
yx_page.pop()
num = yx_page[-1]

# Crawl every list page up to that number.
pageinfo(num)
这便是爬取后存入到数据库中的数据。
此代码仅供学习。
本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)