爬虫代码（TJ）

2023-11-16

getip.py 来自 https://mp.csdn.net/postedit/99288836

import getip
import re
import cx_Oracle
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pymysql
# Tujia (tujia.com) Hangzhou listing crawler.
#
# For every index page in the configured range the script:
#   1. asks the ``getip`` helper module for an IP / user agent pair and
#      installs it (presumably a proxy for urllib -- confirm in getip.py),
#   2. downloads the index page and extracts the detail-page links,
#   3. renders each detail page in headless Chrome,
#   4. parses title, address, description, price and landlord name, and
#   5. inserts one row per listing into the MySQL table ``hzdzsj``.

# A few of the pages to be crawled; handed to getip.check for testing.
testurl = [
    "https://www.tujia.com/gongyu/hangzhou/1/",
    "https://www.tujia.com/gongyu/hangzhou/2/"
]
# Sample detail pages. NOTE(review): not referenced anywhere in this script.
testur2 = [
    "https://www.tujia.com/detail/12690196.htm",
    "https://www.tujia.com/detail/11146003.htm"
]
# Endpoint passed to getip.check together with the test pages.
thisapi = 'http://ip.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=0&fa=0&fetch_key=&groupid=0&qty=1&time=100&pro=&city=&port=1&format=txt&ss=1&css=&dt=1&specialTxt=3&specialJson='

# Captures the detail-page URL of each listing card on an index page.
# Compiled once instead of on every fetch attempt.
_LISTING_LINK_RE = re.compile(
    '<div class="label-tag">.*?<div class="noMmpty">.*?</div>.*?</div><a class="house-detail-link" href="(.*?)" target="_blank">',
    re.S)

# Substitution table for the price shown on the page: the digit d found in
# the page text is replaced by _PRICE_DIGIT_MAP[d]; embedded spaces are
# stripped after substitution.
# NOTE(review): '1' appears twice and '0' never appears in this table --
# confirm the mapping against the site before trusting the stored prices.
_PRICE_DIGIT_MAP = ['9 ', '1', '2', '4', '6 ', '8', '1', '3', '5', '7']

# Responses shorter than these are treated as a failed fetch and retried.
_MIN_INDEX_PAGE_LEN = 3000
_MIN_DETAIL_PAGE_LEN = 200000

# Number of attempts per index page and per detail page.
_MAX_ATTEMPTS = 2

_CHROMEDRIVER_PATH = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'


def _decode_price(text):
    """Translate the price text of a detail page into an integer.

    Each character of *text* is used as an index into ``_PRICE_DIGIT_MAP``.

    Raises:
        ValueError: if *text* is empty or contains a non-digit character.
    """
    substituted = "".join(_PRICE_DIGIT_MAP[int(ch)] for ch in text)
    return int(substituted.replace(' ', ''))


def _render_page(link):
    """Load *link* in headless Chrome and return the rendered page source.

    The browser session is always shut down with ``quit()``, even when
    loading the page fails, so no Chrome/chromedriver process is left
    behind by a failed attempt.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # NOTE(review): executable_path / chrome_options are the keyword names
    # the script was written against; check the installed Selenium version
    # still accepts them before upgrading.
    driver = webdriver.Chrome(
        executable_path=_CHROMEDRIVER_PATH,
        chrome_options=chrome_options)
    try:
        driver.get(link)
        return driver.page_source
    finally:
        driver.quit()


def _parse_listing(html):
    """Extract one database row from the HTML of a listing detail page.

    Returns:
        ``[title, address, description, price, landlord, site]`` where
        *price* is an ``int`` and every other item is a ``str``.

    Raises:
        AttributeError, IndexError, ValueError: when an expected element is
            missing from the page or the price cannot be decoded.
    """
    soup = BeautifulSoup(html, 'lxml')
    name = soup.find_all('span', attrs={'class': 'title__name'})
    price = soup.find_all('span', attrs={'class': 'price__count'})
    add = soup.find('address', attrs={'class': 'unit-title__address'}).find_all('span')
    chat_name = soup.find('div', attrs={'class': 'unit-contact__land__main'}).find_all('a')

    # Fall back to the plain description block when the "simple" variant
    # is not present on the page.
    des = soup.find_all('div', attrs={'class': 'unit-description simple'})
    if not des:
        des = soup.find_all('div', attrs={'class': 'unit-description'})
    description = des[0].text.replace('\n', '')

    return [
        name[0].text,
        add[0].text,
        description,
        _decode_price(price[0].text),
        chat_name[0].text,
        "途家网",
    ]


def _insert_listing(cur, conn, row, count):
    """Insert *row* into ``hzdzsj`` and commit.

    Database errors are printed rather than raised, so one bad row does not
    abort the crawl.

    Args:
        cur: open cursor on *conn*.
        conn: open database connection.
        row: the six values produced by ``_parse_listing``.
        count: running row number, used only in the success message.

    Returns:
        ``True`` if the row was committed, ``False`` otherwise.
    """
    try:
        # Session settings are re-issued before every insert.
        cur.execute("USE hzdz")
        cur.execute('SET NAMES utf8')
        cur.execute('SET CHARACTER SET utf8')
        cur.execute('SET character_set_connection=utf8')
        cur.execute(
            'insert into hzdzsj (标题,地址,房屋描述,租金,房东ID,网站) values(%s,%s,%s,%s,%s,%s)',
            row)
        conn.commit()
        print("第%d" % (count) + "条数据插入成功")
    except Exception as e:
        print(e)
        return False
    return True


ip, ua = getip.check(0, thisapi, testurl)
# Connect to the database and obtain a cursor.
conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='******', db='hzdz', charset='utf8')
cur = conn.cursor()

# Row number reported for the next successful insert.
count = 1671
try:
    for j in range(776, 786):
        url = "https://www.tujia.com/gongyu/hangzhou/" + str(j) + '/'
        for _page_attempt in range(_MAX_ATTEMPTS):
            try:
                ip, ua = getip.check(ip, thisapi, testurl)
                getip.install(ip, ua)

                # The context manager releases the HTTP response once read.
                with urllib.request.urlopen(url) as response:
                    data = response.read().decode("utf-8", "ignore")
                if len(data) < _MIN_INDEX_PAGE_LEN:
                    continue

                print("----当前IP有效--------")
                rst = _LISTING_LINK_RE.findall(data)
                print(rst)
                for link in rst:
                    print(link)
                    for _detail_attempt in range(_MAX_ATTEMPTS):
                        try:
                            ip, ua = getip.check(ip, thisapi, testurl)
                            getip.install(ip, ua)
                            res = _render_page(link)
                            if len(res) < _MIN_DETAIL_PAGE_LEN:
                                continue

                            print("----内链接当前IP有效--------")
                            row = _parse_listing(res)
                            if _insert_listing(cur, conn, row, count):
                                count = count + 1
                            # The page was fetched and parsed; do not retry
                            # it even if the insert itself failed.
                            break
                        except Exception as err:
                            print(err)
                            print("-----------内链接出现异常，准备重试-------------")
                print("----------------第%d" % (j) + "页插入成功--------------")
                break
            except Exception as err:
                print(err)
                print("-----------出现异常，准备重试-------------")
finally:
    # Release the database handles even if the crawl is interrupted.
    cur.close()
    conn.close()

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

爬虫

爬虫代码（TJ）的相关文章

1、shell 基础进阶系列文章

shell 基础进阶系列文章目录第一章 shell本质第二章 shell2 第三章 shell3 第一章 shell认知 shell 基础进阶系列文章前言一 shell基础 1 shell脚本的本质 2 shell编译的基本步骤
zabbix监控数据转存与处理

zabbix是一个基于WEB页面的分布式系统监控方案能够监控各类资产并提供灵活的通知功能同时能够运行在各种流行系统中 zabbix分为zabbix server和zabbix agent端 zabbix server可以单独远程监控服务
美图全链路监控实战

一、摘要：本文内容分为3部分，首先简单介绍了美图的业务背景和监控体系，然后是两个美图的监控实践：基于 Grafana FlowCharting 插件的监控大盘实战，和基于 Grafana ImageRender、企业微信机器人的图文告警实
JS 12——内置对象Math的方法

1 Math 是一个内置对象不是一个函数对象 2 Math 作为内置对象拥有一些数学常数属性和数学函数方法 3 如果Math 用于 Number 类型它不支持 BigInt 4 Math 的所有属性是静态的 Math方法也全部都是静态
在AndroidStudio中如何查看Gradle的版本

以 Android Studio Giraffe 2022.3.1 为例：File > Project Structure > Project，Android Gradle Plugin Version（Android Gradle 插件版本）

随机推荐

WEB程序员需要掌握的十大MySQL优化技巧

51CTO独家特稿 WEB开发者不光要解决程序的效率问题对数据库的快速访问和相应也是一个大问题希望本文能对大家掌握MySQL优化技巧有所帮助 1 优化你的MySQL查询缓存在MySQL服务器上进行查询可以启用高速查询缓存让数据库引
cookie，session，token区别

cookie session token区别前提用户登陆一次以后下次不会再输入密码核心的概念就是存储 cookie流程浏览器发起http请求服务器会进行cookie设置服务器会进行cookie设置也就是set cookie 服
用Python完成毫秒级抢单，助你秒杀淘宝大单

目录 0 引言 1 环境 2 需求分析前期准备 3 淘宝购物流程回顾 4 秒杀的实现 5 代码梳理 6 总结 0 引言年中购物618大狂欢又要开始了各大电商又开始了大力度的折扣促销如何做到更省钱的剁手呢今天给大家提供一种思路用P
java宏定义三目运算define_宏定义的正确写法，三目运算的宏定义

第一阶段对象宏 define M PI 3 14159265358979323846264338327950288 函数宏 define PLUS x y x y 正确的认识宏三目运算的宏定义 1 小白写法 define MIN A B
非线性控制4——Back Stepping

1 基本思想 2 重要定理 3 实例仿真单机械臂稳定控制 3 1 模型建立以单机械臂控制为例具有参数不确定性的单机械臂的模型如式 3 1 3 1 式中为机械臂的位置为速度为加速度为电动机给出的驱动力矩为控制信号输入为机械臂
私域流量对比：微信公众号、小程序、APP，谁更有价值？

在数字化时代流量已经成为了互联网企业最重要的资源之一而对于企业来说获取到流量只是第一步如何将流量转化为价值才是最终目的对于私域流量的获取和转化微信公众号小程序和APP是目前最常见的三种方式那么这三种私域流量各有什么优缺点呢
c++11 lambda表达式

lambda 表达式使用一对方括号作为开始的标识类似于声明一个函数只不过这个函数没有名字也就是一个匿名函数其返回值是自动推断的函数体足够简单的情况当然也可以指定返回值类型 c 11 lambda语法形式 p int a gt i
mmdetection的下载与安装（附带跑solov2示例）

一找到官方文档按步骤安装 mmdetection中包含许多模型的检测框架下载以后方便后续调用官方文档地址依赖 MMDetection 2 27 0 文档需要注意的点按照步骤来基本没有什么问题注意CUDA torch mmcv的
centos7 linux定时任务详解

前言工作中需要开启一个定时任务每天晚上2点进行爬虫代码的运行这不得不去学习一下linux 下的定时任务crontab crontab yum install crontabs 说明 sbin service crond start 启
idea 部署项目到tomcat

转 http www biliyu com article 986 html 这篇文章的后半部分解决了困扰我一上午的问题因为遇到问题不喜欢问别人然后搜到这篇文章我在百度输入的是 warning no artifacts marked
如何解决Ubuntu 下gstm不能打开图形用户界面的问题

gstm是一款Linux下的SSH管理工具至于gstm的主要作用来这里看此文的人都懂的用此工具一段时间后某天打开时突然出现不能打开图形界面的问题一阵焦虑卸载重装n遍后仍不能解决查找半天资料突然在一小论坛某位网友的回答中解决了
如何在 Mapbox GL JS 中加载任意投影的图片？

在 Mapbox GL JS API 中我们可以使用 image source 和 raster layer 将图片叠加到地图上例如 Add a raster image to a map layer 但是因为 Mapbox 使用网
Ubuntu 代理上网设置(firefox，新立得，apt-get等)

现在公司需要代理上网，ubuntu 又是那么的依赖网络，前几天在公司装了 ubuntu 就开始查资料设置代理上网。以下整合一下，部分是参考其他网友的，这里就不一一说明了。一、Firefox 代理上网：这个最简单了，依次点击 edit > pref
git 本地仓库关联到远程仓库

将本地仓库关联到远程仓库方式一远程仓库没有文件第一步 git init 初始化git仓库第二步 git remote add 地址设置remote地址第三步 git add 将所有变更提交到本地仓库第四步 git commit
SpringBoot激活profiles的几种方式

多环境是最常见的配置隔离方式之一可以根据不同的运行环境提供不同的配置信息来应对不同的业务场景在SpringBoot内支持了多种配置隔离的方式可以激活单个或者多个配置文件激活的profiles要在项目内创建对应的配置文件格式为app
LearnOpenGL - 绘制三角形完整代码

include
Linux下安装Python3.9（orangepi Zero2）

1 查看当前Linux下自带的Python版本 python version 2 更新Linux源 sudo apt update 3 安装Python所需要的环境代码如下通用代码树莓派全志 Linux均适用 sudo apt in
Octave常用函数

矩阵生成 eye n 生成n行n列的单位矩阵 rand n m 随机生成n行m列大小范围在0 1之间的随机数 randn n m 按高斯分布生成n行m列的随机数 ones n m 生成n行m列元素均为1的矩阵 zeros n m 生成n行m
kaggle：泰坦尼克生存预测（ R语言机器学习分类算法）

本文在基本的多元统计分析技术理论基础上结合机器学习基本模型选择Kaggle 数据建模竞赛网站的入门赛 Titanic生存预测作为实战演练较为完整地呈现了数据建模的基本流程和思路采用的模型有逻辑回归决策树 SVM支持向量机以及进阶
爬虫代码（TJ）

getip.py 来自 https://mp.csdn.net/postedit/99288836 import getip import re import cx_Oracle import urllib.request from bs4 imp

爬虫代码（TJ）

爬虫代码（TJ） 的相关文章

随机推荐

热门标签

爬虫代码（TJ）的相关文章