Crawler: collecting free proxies and testing whether they work
Scraping free proxies
import re
from urllib import request

from lxml import etree

# Fetch our real IP first, so we can tell later whether a proxy actually changes it.
test_url = "http://www.httpbin.org/ip"
local_ip = request.urlopen(test_url).read().decode()

# Index page of the free-proxy blog; each article link holds a batch of proxies.
url = "https://ip.jiangxianli.com/blog.html"
res = request.urlopen(url).read().decode('utf-8')

ele = etree.HTML(res)
char_url_list = ele.xpath('//h3/a/@href')
for char_url in char_url_list:
    res1 = request.urlopen(char_url).read().decode('utf-8')
    rule = '<p>(.*?)</p>'
    ip_port_list = re.findall(rule, res1)  # search the article page (res1), not the index page
    for i in ip_port_list[1:]:  # skip the first <p>, which is not a proxy entry
        ip_port = i.split('@HTTP')[0]
        ip_port1 = ip_port.lstrip()
        dict1 = {'http': ip_port1}
        print("Testing proxy {}".format(dict1))
        proxy_handler = request.ProxyHandler(dict1)
        opener = request.build_opener(proxy_handler)
        try:
            now_ip = opener.open(test_url, timeout=4).read().decode()
            if now_ip != local_ip:
                print("Proxy works: {}".format(dict1))
                with open("ip.txt", "a") as w:
                    w.write(str(dict1) + "\n")
                print("Saved proxy {}".format(dict1))
        except Exception:
            pass  # unreachable or slow proxies are simply skipped
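For reference, a minimal sketch of the parsing step above. The exact entry format is an assumption inferred from the split on '@HTTP'; the sample line is hypothetical, not taken from the site.

# Hypothetical <p> text; the real format on ip.jiangxianli.com may differ.
line = " 1.2.3.4:8080@HTTP#example"
ip_port = line.split('@HTTP')[0].lstrip()   # keep only "ip:port"
print({'http': ip_port})                    # {'http': '1.2.3.4:8080'}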
Testing the proxies
import ast
from urllib import request

test_url = "http://www.httpbin.org/ip"
local_ip = request.urlopen(test_url).read().decode()

with open('ip.txt', 'r') as r:
    ips = r.readlines()

for ip in ips:
    # Each line is a str(dict) such as "{'http': '1.2.3.4:8080'}";
    # ast.literal_eval parses it without the risks of eval().
    proxy_handler = request.ProxyHandler(ast.literal_eval(ip))
    opener = request.build_opener(proxy_handler)
    try:
        now_ip = opener.open(test_url, timeout=4).read().decode()
        if now_ip != local_ip:
            print("Proxy works: {}".format(ip.strip()))
            with open('ip1.txt', 'a') as w:
                w.write(ip.strip() + "\n")  # strip the newline readlines() kept, to avoid blank lines
    except Exception:
        pass
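To actually browse through one of the re-verified proxies, a short sketch like the following works, assuming ip1.txt holds one str(dict) entry per line as written above:

import ast
from urllib import request

# Read the first saved proxy, e.g. {'http': '1.2.3.4:8080'}
with open('ip1.txt') as f:
    proxy = ast.literal_eval(f.readline())

opener = request.build_opener(request.ProxyHandler(proxy))
print(opener.open("http://www.httpbin.org/ip", timeout=4).read().decode())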