爬取猫眼电影top100
Published in:2020-09-29 |

使用ip代理爬取猫眼电影top100

爬取猫眼电影top100

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import ast
import gzip
import re
from urllib import request

from lxml import etree

def _decode_body(raw):
    """Return *raw* response bytes as text, gunzipping first when needed.

    The two-byte gzip magic number is checked explicitly instead of wrapping
    ``gzip.decompress`` in a bare ``except``, so unrelated errors (including
    KeyboardInterrupt) are no longer silently swallowed.
    """
    if raw[:2] == b'\x1f\x8b':
        raw = gzip.decompress(raw)
    return raw.decode('utf-8')


# Running count of records appended to movie_top100.txt.
num = 0

# Compiled once because it is applied to every detail page. It captures three
# full lines, plus the start of a fourth, that precede an
# <li class="ellipsis"> element.
detail_rule = re.compile('(.*?)\n(.*?)\n(.*?)\n(.*?)<li class="ellipsis">(.*?)<')

with open("ip.txt", "r") as r:
    ips = r.readlines()

# Each non-blank line of ip.txt is expected to be a dict literal such as
# {'http': 'host:port'}. literal_eval only accepts Python literals, so a
# tampered proxy file cannot execute code the way eval() would.
proxies = [ast.literal_eval(line) for line in ips if line.strip()]

for pn in range(0, 100, 10):
    url = "https://maoyan.com/board/4?offset={}".format(pn)
    headers = {
        'user-agent': 'YoudaoBot',
        "referer": url,
    }
    req = request.Request(url, headers=headers)

    # Try the proxies in order and stop at the first one that delivers the
    # listing page. Scraping the page again for every proxy would write each
    # movie once per proxy line, and one dead proxy would abort the run.
    # NOTE(review): if the entries only carry an 'http' key, urllib will not
    # route these https:// requests through them -- confirm whether an
    # 'https' mapping is intended.
    res = None
    for proxy in proxies:
        opener = request.build_opener(request.ProxyHandler(proxy))
        try:
            with opener.open(req) as resp:
                res = _decode_body(resp.read())
        except OSError:
            # urllib.error.URLError and socket errors are OSError subclasses.
            continue
        break
    if res is None:
        print("no proxy could fetch", url)
        continue

    # Parse the listing page and walk it one <dd> entry at a time.
    ele = etree.HTML(res)
    for dd in ele.xpath('//dl[@class="board-wrapper"]/dd'):
        # Read the score from this entry rather than from the first match on
        # the whole page, which gave every movie the first movie's score.
        # Assumes each <dd> contains its own integer/fraction <i> pair --
        # TODO confirm against the live markup.
        movie_grade = "".join(
            dd.xpath('.//i[@class="integer"]/text()')[:1]
            + dd.xpath('.//i[@class="fraction"]/text()')[:1]
        )
        for href in dd.xpath('./a/@href'):
            # Build the absolute detail-page URL.
            movie_url = "https://maoyan.com" + href
            req1 = request.Request(movie_url, headers=headers)
            # NOTE(review): detail pages are fetched directly, not through
            # the proxy opener, exactly as before -- confirm this is intended.
            with request.urlopen(req1) as resp1:
                result1 = _decode_body(resp1.read())

            ele1 = etree.HTML(result1)
            names = ele1.xpath('//h1/text()')
            found = detail_rule.search(result1)
            if not names or found is None:
                # Unexpected layout (for example a verification page): skip
                # this movie instead of crashing with IndexError.
                print("skipped (unexpected page layout):", movie_url)
                continue

            movie_name = names[0]
            # The fixed offsets drop the leading characters of the captured
            # lines (presumably indentation/separators -- verify on a page).
            movie_countries = found.group(1)[8:]
            movie_time = found.group(2)[12:]

            with open('movie_top100.txt', 'a', encoding='utf-8') as w:
                w.write(movie_name + " " + movie_countries + " " + movie_time + " " + movie_grade + " \n")
            num += 1
            print(num)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
抓取免费高匿代理

{'http': '125.108.71.122:9000'}
{'http': '182.149.83.56:9999'}
{'http': '183.166.139.78:9999'}
{'http': '110.243.22.28:9999'}
{'http': '175.42.158.86:9999'}
{'http': '123.163.118.153:9999'}
{'http': '60.191.11.251:3128'}
{'http': '103.247.219.30:36295'}
{'http': '114.109.162.18:8080'}
{'http': '118.25.40.151:1080'}
{'http': '113.252.222.73:8380'}
{'http': '110.243.3.226:9999'}
{'http': '94.205.254.82:3128'}
{'http': '114.104.138.96:9999'}
{'http': '58.250.21.56:3128'}
{'http': '140.143.156.166:1080'}
{'http': '140.143.142.218:1080'}
{'http': '61.148.199.222:3128'}
{'http': '113.121.38.31:9999'}
{'http': '123.149.137.44:9999'}
{'http': '116.62.204.38:9999'}
{'http': '222.175.171.6:8080'}
{'http': '54.38.141.157:3128'}
{'http': '105.27.237.31:80'}
{'http': '54.38.63.140:3128'}
{'http': '165.225.32.114:10223'}
{'http': '51.83.231.21:3128'}
{'http': '165.225.32.113:10223'}
{'http': '54.38.63.141:3128'}
{'http': '165.225.32.107:13084'}
{'http': '54.38.141.159:3128'}
{'http': '51.77.61.153:3128'}
{'http': '165.225.84.146:8800'}
{'http': '51.83.231.86:3128'}
{'http': '51.83.233.109:3128'}
{'http': '51.83.231.87:3128'}
{'http': '46.101.140.93:3128'}
{'http': '165.225.32.118:10223'}
{'http': '115.53.34.149:9999'}
{'http': '54.38.141.157:3128'}
{'http': '105.27.237.31:80'}
{'http': '110.243.3.226:9999'}
{'http': '54.38.63.140:3128'}
{'http': '118.24.172.149:1080'}
{'http': '165.225.32.114:10223'}
{'http': '51.83.231.21:3128'}
{'http': '106.110.212.165:9999'}
{'http': '95.0.66.69:8080'}
{'http': '165.225.32.113:10223'}
{'http': '171.35.170.212:9999'}
{'http': '105.27.237.28:80'}
{'http': '165.225.32.107:13084'}
{'http': '51.83.232.95:3128'}
{'http': '54.38.141.159:3128'}
{'http': '115.218.214.35:9000'}
{'http': '51.77.61.153:3128'}
{'http': '165.225.84.146:8800'}
{'http': '51.83.231.86:3128'}
{'http': '51.83.233.109:3128'}
{'http': '51.83.231.23:3128'}
{'http': '46.101.140.93:3128'}
{'http': '165.225.32.118:10223'}
{'http': '165.225.84.148:8800'}
{'http': '54.38.51.134:3128'}
{'http': '165.225.32.106:10223'}

Prev:
自定义实现迭代器
Next:
Clawer-爬虫采集免费的代理