文章使用的代理ip链接为:
云代理: http://www.ip3366.net/?stype=1&page=1
验证IP网站为: http://httpbin.org/ip
用get方式发起请求,要设置好请求头和Cookie
self.faker = Faker(locale='zh_CN')
self.headers = {
'User-Agent': self.faker.chrome(),
'Cookie':'Hm_lvt_c4dd741ab3585e047d56cf99ebbbe102=1667027823,1668068804; Hm_lpvt_c4dd741ab3585e047d56cf99ebbbe102=1668068825',
'Host':'www.ip3366.net',
}
这里的UA我是使用faker伪造的,当然这也没什么,只是为了让服务器认为你是一个正常的浏览器用户
请求成功后,用xpath来获取ip地址和端口号
将获取到的IP地址和端口号放到列表中
num = int(input('输入爬取页数: '))
self.ip_list = []
for page in range(num):
print(f"==============================正在爬取第{page+1}页==============================")
self.url = f'http://www.ip3366.net/?stype=1&page={page+1}'
reqs = requests.get(self.url, headers=self.header)
reqs.encoding = 'gb2312'
selecotors = Selector(reqs.text)
tr_lists = selecotors.xpath('//div[@id="container"]/div[@id="list"]/table[@class="table table-bordered table-striped"]/tbody/tr')
item = {}
for tr_list in tr_lists:
item['ip_dz'] = tr_list.xpath('./td[1]/text()').get()
item['prots'] = tr_list.xpath('./td[2]/text()').get()
# item['type_s'] = tr_list.xpath('./td[4]/text()').get()
self.proxys = {
'http': item['ip_dz']+ ':'+item['prots'],
'https': item['ip_dz']+ ':'+item['prots']
}
print(self.proxys)
self.ip_list.append(self.proxys)
进行IP验证,设置超时时间为6秒,超时则视为不可用,从而筛选出可用IP
can_ip = []
cant_ip = []
for ip in self.ip_list:
try:
req = requests.get(url='http://httpbin.org/ip', headers=self.headers, proxies=ip, timeout=6)
# print(req.json())
req_json = req.json()
req_ip = req_json.get('origin')
print('这是origin: ',req_ip)
http_ip_prot = ip.get('http')
http_ip = re.search(r'(.*?):(\d+)', http_ip_prot, re.S).group(1)
print('这是http_ip: ', http_ip)
try:
if req.status_code == 200 and http_ip==req_ip:
can_ip.append(ip)
print(ip, '可用')
print(req.json())
else:
cant_ip.append(ip)
print(ip, '不可用')
except:
cant_ip.append(ip)
print(ip, '不可用')
except:
cant_ip.append(ip)
print(ip, '不可用')
将可用IP保存为csv文件
with open('IP_use.csv', 'w', encoding='utf-8', newline='') as f:
writer = csv.writer(f)
writer.writerow(can_ip)
这里展示全部代码:
import requests
# import time
from faker import Faker
from parsel import Selector
import csv
import re
class IPDL:
    """Scrape free proxy addresses from www.ip3366.net, verify each one
    against http://httpbin.org/ip, and save the usable proxies to
    ``IP_use.csv`` (one proxy per row).

    Usage: ``IPDL().mains()`` — prompts on stdin for the number of
    listing pages to crawl.
    """

    def __init__(self):
        # Faker produces a realistic Chrome User-Agent so the scrape
        # looks like a normal browser visit.
        self.faker = Faker(locale='zh_CN')
        self.header = {
            'User-Agent': self.faker.chrome(),
            'Cookie': 'Hm_lvt_c4dd741ab3585e047d56cf99ebbbe102=1674180209; Hm_lpvt_c4dd741ab3585e047d56cf99ebbbe102=1674184681',
            'Host': 'www.ip3366.net',
        }

    def get_html(self):
        """Crawl the requested number of listing pages and collect
        candidate proxies into ``self.ip_list``.

        Each entry is a ``requests``-style proxies mapping:
        ``{'http': 'ip:port', 'https': 'ip:port'}``.
        """
        num = int(input('输入爬取页数: '))
        self.ip_list = []
        for page in range(num):
            print(f"==============================正在爬取第{page+1}页==============================")
            url = f'http://www.ip3366.net/?stype=1&page={page+1}'
            reqs = requests.get(url, headers=self.header)
            # The site serves GB2312-encoded HTML.
            reqs.encoding = 'gb2312'
            selectors = Selector(reqs.text)
            tr_lists = selectors.xpath('//div[@id="container"]/div[@id="list"]/table[@class="table table-bordered table-striped"]/tbody/tr')
            for tr in tr_lists:
                ip_addr = tr.xpath('./td[1]/text()').get()
                port = tr.xpath('./td[2]/text()').get()
                # Skip malformed rows: xpath .get() returns None when the
                # cell is missing, and None + ':' would raise TypeError.
                if not ip_addr or not port:
                    continue
                proxys = {
                    'http': ip_addr + ':' + port,
                    'https': ip_addr + ':' + port,
                }
                print(proxys)
                self.ip_list.append(proxys)

    def train_ip(self):
        """Verify each collected proxy (6-second timeout).

        A proxy counts as usable only when the request through it
        succeeds AND httpbin's reported ``origin`` equals the proxy's own
        host — i.e. the proxy actually hides the real IP.  Usable
        proxies are written to ``IP_use.csv``, one per row.
        """
        self.headers = {
            'User-Agent': self.faker.chrome(),
        }
        can_ip = []
        cant_ip = []
        for ip in self.ip_list:
            try:
                req = requests.get(url='http://httpbin.org/ip', headers=self.headers, proxies=ip, timeout=6)
                req_json = req.json()
                req_ip = req_json.get('origin')
                print('这是origin: ', req_ip)
                # Host part of 'ip:port' — split from the right so only
                # the final ':port' is removed.
                http_ip = ip.get('http', '').rsplit(':', 1)[0]
                print('这是http_ip: ', http_ip)
                if req.status_code == 200 and http_ip == req_ip:
                    can_ip.append(ip)
                    print(ip, '可用')
                    print(req_json)
                else:
                    cant_ip.append(ip)
                    print(ip, '不可用')
            # Narrow handling (the original bare ``except:`` also ate
            # KeyboardInterrupt): RequestException covers connection
            # errors and timeouts; ValueError covers a non-JSON body.
            except (requests.RequestException, ValueError):
                cant_ip.append(ip)
                print(ip, '不可用')
        with open('IP_use.csv', 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            # One proxy per row — the original ``writerow(can_ip)`` dumped
            # every proxy dict into a single row.
            for proxy in can_ip:
                writer.writerow([proxy['http']])
        print('可用ip共:', len(can_ip))
        print('不可用ip共:', len(cant_ip))

    def mains(self):
        """Entry point: crawl the listing pages, then verify and save."""
        self.get_html()
        self.train_ip()
if __name__ == '__main__':
    # Run the full crawl-then-verify pipeline when executed as a script.
    crawler = IPDL()
    crawler.mains()