# Note: the getip module (getip.py) comes from https://mp.csdn.net/postedit/99288836
import getip
import re
import cx_Oracle
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pymysql
# A few of the pages that will be crawled, used for testing
# (they are handed to getip.check together with the proxy API URL).
testurl = [
    "https://www.tujia.com/gongyu/hangzhou/1/",
    "https://www.tujia.com/gongyu/hangzhou/2/",
]

# Two sample detail pages; nothing in the visible script reads this list.
testur2 = [
    "https://www.tujia.com/detail/12690196.htm",
    "https://www.tujia.com/detail/11146003.htm",
]

# Proxy-provider API URL passed to getip.check. The adjacent literals below
# are concatenated by the parser into one unbroken URL string.
thisapi = (
    'http://ip.11jsq.com/index.php/api/entry'
    '?method=proxyServer.generate_api_url'
    '&packid=0&fa=0&fetch_key=&groupid=0'
    '&qty=1&time=100&pro=&city=&port=1'
    '&format=txt&ss=1&css=&dt=1'
    '&specialTxt=3&specialJson='
)

# Initial (ip, ua) pair; 0 is passed where later calls pass the current ip.
ip, ua = getip.check(0, thisapi, testurl)

# MySQL connection and cursor shared by the rest of the script.
conn = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='******',
    db='hzdz',
    charset='utf8',
)
cur = conn.cursor()

# Disabled Oracle alternative:
# conn = cx_Oracle.connect('***', '***', 'localhost:1521/orcl')  # connect to the database
# cur = conn.cursor()  # obtain a cursor

# Running row number printed after each successful insert; starts at 1671.
count = 1671
# ---------------------------------------------------------------------------
# Crawl listing-index pages 776..785, open every listing found on them in
# headless Chrome, extract the listing fields and insert one row per listing
# into the MySQL table hzdzsj.
#
# Relies on module-level names defined earlier in the file:
#   ip, ua, thisapi, testurl, conn, cur, count
# ---------------------------------------------------------------------------

# Each page fetch and each listing fetch is tried at most this many times.
MAX_ATTEMPTS = 2

# Captures the detail-page URL of every listing on an index page.
# Compiled once instead of once per page.
LISTING_LINK_RE = re.compile(
    '<div class="label-tag">.*?<div class="noMmpty">.*?</div>.*?</div><a class="house-detail-link" href="(.*?)" target="_blank">',
    re.S)

# Substitution table for the price text: the digit at position d on the page
# is replaced by PRICE_DIGITS[d].
# NOTE(review): both 1 and 6 map to '1' and no digit maps to '0', so this
# table is not a bijection - verify it against the site's current encoding.
PRICE_DIGITS = ('9', '1', '2', '4', '6', '8', '1', '3', '5', '7')


def _decode_price(obfuscated):
    """Return the integer price for the digit-substituted text *obfuscated*.

    Raises ValueError when the text is empty or contains a character that is
    not a decimal digit.
    """
    return int("".join(PRICE_DIGITS[int(ch)] for ch in obfuscated))


def _render_detail_page(link):
    """Load *link* in headless Chrome and return the rendered page source.

    The driver is always shut down, including when loading the page raises,
    so failed attempts do not leave browser processes behind.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # NOTE(review): no proxy argument is passed to Chrome here; only the two
    # flags above are set. Confirm whether the proxy from getip is meant to
    # apply to these requests as well.
    driver = webdriver.Chrome(
        executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe',
        chrome_options=chrome_options)
    try:
        driver.get(link)
        return driver.page_source
    finally:
        driver.quit()


def _parse_listing(html):
    """Extract one database row from a rendered listing page.

    Returns a list [title, address, description, rent, landlord, site] in the
    column order used by the insert statement. Raises AttributeError,
    IndexError or ValueError when an expected element is missing or the price
    cannot be decoded; the caller treats that as a failed attempt.
    """
    soup = BeautifulSoup(html, 'lxml')
    titles = soup.find_all('span', attrs={'class': 'title__name'})
    prices = soup.find_all('span', attrs={'class': 'price__count'})
    address_spans = soup.find(
        'address', attrs={'class': 'unit-title__address'}).find_all('span')
    landlord_links = soup.find(
        'div', attrs={'class': 'unit-contact__land__main'}).find_all('a')

    # Prefer the "simple" description block; fall back to the plain one.
    descriptions = soup.find_all('div', attrs={'class': 'unit-description simple'})
    if len(descriptions) < 1:
        descriptions = soup.find_all('div', attrs={'class': 'unit-description'})
    description = descriptions[0].text.replace('\n', '')

    return [
        titles[0].text,
        address_spans[0].text,
        description,
        _decode_price(prices[0].text),
        landlord_links[0].text,
        "途家网",
    ]


def _insert_listing(row):
    """Insert *row* into hzdzsj and commit; return True on success.

    On a database error the error is printed, the open transaction is rolled
    back and False is returned.
    """
    try:
        # The schema and session character set are re-asserted before every
        # insert, as the original script did.
        cur.execute("USE hzdz")
        cur.execute('SET NAMES utf8')
        cur.execute('SET CHARACTER SET utf8')
        cur.execute('SET character_set_connection=utf8')
        cur.execute(
            'insert into hzdzsj (标题,地址,房屋描述,租金,房东ID,网站) values(%s,%s,%s,%s,%s,%s)',
            row)
        conn.commit()
    except pymysql.MySQLError as e:
        print(e)
        conn.rollback()
        return False
    return True


try:
    for j in range(776, 786):
        url = "https://www.tujia.com/gongyu/hangzhou/" + str(j) + '/'
        for _page_attempt in range(MAX_ATTEMPTS):
            try:
                # Refresh the proxy / user agent before every request.
                ip, ua = getip.check(ip, thisapi, testurl)
                getip.install(ip, ua)
                with urllib.request.urlopen(url) as response:
                    data = response.read().decode("utf-8", "ignore")
                # A very short body is treated as a failed fetch: try again.
                if len(data) < 3000:
                    continue
                print("----当前IP有效--------")
                rst = LISTING_LINK_RE.findall(data)
                print(rst)
                for link in rst:
                    print(link)
                    for _link_attempt in range(MAX_ATTEMPTS):
                        try:
                            ip, ua = getip.check(ip, thisapi, testurl)
                            getip.install(ip, ua)
                            res = _render_detail_page(link)
                            # Same idea as above, with the threshold used for
                            # rendered detail pages.
                            if len(res) < 200000:
                                continue
                            print("----内链接当前IP有效--------")
                            row = _parse_listing(res)
                            if _insert_listing(row):
                                print("第%d" % (count) + "条数据插入成功")
                                count = count + 1
                            # The page was fetched and parsed; do not retry
                            # the link even if the insert itself failed.
                            break
                        except Exception as err:
                            print(err)
                            print("-----------内链接出现异常,准备重试-------------")
                # Printed once the page's links have all been attempted,
                # whether or not every individual listing was stored.
                print("----------------第%d" % (j) + "页插入成功--------------")
                break
            except Exception as err:
                print(err)
                print("-----------出现异常,准备重试-------------")
finally:
    # Release the database resources even if the crawl is interrupted.
    cur.close()
    conn.close()