爬取猫眼电影top100

使用ip代理爬取猫眼电影top100

爬取猫眼电影top100

import ast
import gzip
import re
from urllib import request

from lxml import etree

# Crawl the Maoyan "top 100" board through a list of HTTP proxies and append
# one line per movie (name, countries, release time, grade) to OUTPUT_FILE.

PROXY_FILE = "ip.txt"
OUTPUT_FILE = "movie_top100.txt"
BOARD_URL = "https://maoyan.com/board/4?offset={}"
SITE_ROOT = "https://maoyan.com"
# Free proxies are frequently dead; never wait on one indefinitely.
REQUEST_TIMEOUT = 10

# Matches the four text lines that precede the first <li class="ellipsis">
# element of a detail page. Group 1 carries the countries and group 2 the
# release time, each behind a fixed-width prefix that is sliced off below.
DETAIL_RULE = re.compile('(.*?)\n(.*?)\n(.*?)\n(.*?)<li class="ellipsis">(.*?)<')
COUNTRIES_PREFIX_LEN = 8
TIME_PREFIX_LEN = 12


def parse_proxy(line):
    """Parse one line of the proxy file into a proxy mapping.

    Each line is expected to hold a dict literal such as
    ``{'http': '125.108.71.122:9000'}``.

    Returns:
        The non-empty dict described by the line, or ``None`` when the line
        is blank, is not a Python literal, or is not a dict.
    """
    text = line.strip()
    if not text:
        return None
    try:
        # literal_eval only accepts literals, so a tampered proxy file
        # cannot execute code the way eval() would.
        proxy = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        return None
    if isinstance(proxy, dict) and proxy:
        return proxy
    return None


def load_proxies(path):
    """Return every usable proxy mapping found in the file at *path*."""
    with open(path, "r") as proxy_file:
        return [p for p in map(parse_proxy, proxy_file) if p is not None]


def decode_body(raw):
    """Decode a response body to text, gunzipping it first when needed."""
    # A gzip stream always starts with the magic bytes 1f 8b.
    if raw[:2] == b"\x1f\x8b":
        raw = gzip.decompress(raw)
    return raw.decode("utf-8")


def fetch(url, headers, proxies):
    """Fetch *url* through the first proxy that answers.

    Proxies are tried in order; a failing proxy is skipped instead of
    aborting the whole crawl.

    NOTE(review): urllib picks a proxy by URL scheme. The sample entries only
    map 'http' while the target URLs are https -- confirm that the proxies
    are actually used for these requests.

    Returns:
        The decoded page text, or ``None`` when no proxy succeeded
        (including when *proxies* is empty).
    """
    req = request.Request(url, headers=headers)
    for proxy in proxies:
        opener = request.build_opener(request.ProxyHandler(proxy))
        try:
            with opener.open(req, timeout=REQUEST_TIMEOUT) as resp:
                raw = resp.read()
        except OSError:
            # URLError, HTTPError and socket timeouts all derive from
            # OSError; move on to the next proxy.
            continue
        return decode_body(raw)
    return None


def board_entries(board_tree):
    """Yield ``(detail_url, grade)`` for every movie on a board page.

    The grade is read from the same <dd> element as the link, so each movie
    gets its own score rather than the first score on the page.
    """
    for entry in board_tree.xpath('//dl[@class="board-wrapper"]/dd'):
        hrefs = entry.xpath('./a/@href')
        if not hrefs:
            continue
        # The score is split across two <i> elements (integer and fraction).
        grade = "".join(
            entry.xpath('.//i[@class="integer"]/text()')[:1]
            + entry.xpath('.//i[@class="fraction"]/text()')[:1]
        )
        yield SITE_ROOT + hrefs[0], grade


def extract_detail_fields(html):
    """Return ``(countries, release_time)`` from a detail page, or ``None``.

    ``None`` is returned when the page does not match DETAIL_RULE.
    """
    match = DETAIL_RULE.search(html)
    if match is None:
        return None
    countries = match.group(1)[COUNTRIES_PREFIX_LEN:]
    release_time = match.group(2)[TIME_PREFIX_LEN:]
    return countries, release_time


def main():
    """Crawl the ten board pages and append one line per movie to OUTPUT_FILE."""
    proxies = load_proxies(PROXY_FILE)
    num = 0
    with open(OUTPUT_FILE, 'a', encoding='utf-8') as out:
        for pn in range(0, 100, 10):
            url = BOARD_URL.format(pn)
            headers = {
                'user-agent': 'YoudaoBot',
                "referer": url,
            }
            board_html = fetch(url, headers, proxies)
            if board_html is None:
                print("skipped board page (no proxy succeeded): " + url)
                continue
            # Use XPath to pull each movie's detail URL and grade.
            for movie_url, movie_grade in board_entries(etree.HTML(board_html)):
                detail_html = fetch(movie_url, headers, proxies)
                if detail_html is None:
                    print("skipped movie (no proxy succeeded): " + movie_url)
                    continue
                names = etree.HTML(detail_html).xpath('//h1/text()')
                fields = extract_detail_fields(detail_html)
                if not names or fields is None:
                    print("skipped movie (unexpected page layout): " + movie_url)
                    continue
                movie_countries, movie_time = fields
                out.write(names[0] + " " + movie_countries + " " + movie_time + " " + movie_grade + " \n")
                num += 1
                print(num)


if __name__ == "__main__":
    main()

抓取免费高匿代理

{'http': '125.108.71.122:9000'}
{'http': '182.149.83.56:9999'}
{'http': '183.166.139.78:9999'}
{'http': '110.243.22.28:9999'}
{'http': '175.42.158.86:9999'}
{'http': '123.163.118.153:9999'}
{'http': '60.191.11.251:3128'}
{'http': '103.247.219.30:36295'}
{'http': '114.109.162.18:8080'}
{'http': '118.25.40.151:1080'}
{'http': '113.252.222.73:8380'}
{'http': '110.243.3.226:9999'}
{'http': '94.205.254.82:3128'}
{'http': '114.104.138.96:9999'}
{'http': '58.250.21.56:3128'}
{'http': '140.143.156.166:1080'}
{'http': '140.143.142.218:1080'}
{'http': '61.148.199.222:3128'}
{'http': '113.121.38.31:9999'}
{'http': '123.149.137.44:9999'}
{'http': '116.62.204.38:9999'}
{'http': '222.175.171.6:8080'}
{'http': '54.38.141.157:3128'}
{'http': '105.27.237.31:80'}
{'http': '54.38.63.140:3128'}
{'http': '165.225.32.114:10223'}
{'http': '51.83.231.21:3128'}
{'http': '165.225.32.113:10223'}
{'http': '54.38.63.141:3128'}
{'http': '165.225.32.107:13084'}
{'http': '54.38.141.159:3128'}
{'http': '51.77.61.153:3128'}
{'http': '165.225.84.146:8800'}
{'http': '51.83.231.86:3128'}
{'http': '51.83.233.109:3128'}
{'http': '51.83.231.87:3128'}
{'http': '46.101.140.93:3128'}
{'http': '165.225.32.118:10223'}
{'http': '115.53.34.149:9999'}
{'http': '54.38.141.157:3128'}
{'http': '105.27.237.31:80'}
{'http': '110.243.3.226:9999'}
{'http': '54.38.63.140:3128'}
{'http': '118.24.172.149:1080'}
{'http': '165.225.32.114:10223'}
{'http': '51.83.231.21:3128'}
{'http': '106.110.212.165:9999'}
{'http': '95.0.66.69:8080'}
{'http': '165.225.32.113:10223'}
{'http': '171.35.170.212:9999'}
{'http': '105.27.237.28:80'}
{'http': '165.225.32.107:13084'}
{'http': '51.83.232.95:3128'}
{'http': '54.38.141.159:3128'}
{'http': '115.218.214.35:9000'}
{'http': '51.77.61.153:3128'}
{'http': '165.225.84.146:8800'}
{'http': '51.83.231.86:3128'}
{'http': '51.83.233.109:3128'}
{'http': '51.83.231.23:3128'}
{'http': '46.101.140.93:3128'}
{'http': '165.225.32.118:10223'}
{'http': '165.225.84.148:8800'}
{'http': '54.38.51.134:3128'}
{'http': '165.225.32.106:10223'}

Hexo

爬取猫眼电影top100

你的赏识是我前进的动力