Crawler-51job爬取
Published in:2020-09-27 |

简单的51job爬取五万条职位信息,正则匹配数据,以及json处理数据

Crawler-51job爬取

简单的51job爬取五万条职位信息

正则匹配数据

import gzip
import re
from urllib import request

# Scrape up to 50,000 "java" job postings from 51job search-result pages and
# append them to 51job.txt, one "name company salary place" line per posting.
num = 0       # total rows written across all pages
done = False  # set when the 50,000-row cap is reached
for pn in range(1, 1462):
    url = "https://search.51job.com/list/000000,000000,0000,00,9,99,java,2,{}.html".format(pn)
    headers = {
        'user-agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20101101 Firefox/6.0",
        'referer': 'url'}
    # BUG FIX: the original never attached `headers` to the request, so the
    # spoofed user-agent was never actually sent.
    req = request.Request(url, headers=headers)
    res = request.urlopen(req).read()
    try:
        # The response may or may not be gzip-compressed; pages are GBK-encoded.
        res = gzip.decompress(res).decode('gbk')
    except OSError:
        # Not gzip data (gzip.BadGzipFile subclasses OSError) — decode as-is.
        # The original used a bare `except:`, which would also hide real errors.
        res = res.decode('gbk')

    # The listing data is embedded in the page as JSON text; pull each field
    # out with a non-greedy regex on its key.
    job_names = re.findall('"job_title":"(.*?)"', res)
    job_company = re.findall('"company_name":"(.*?)"', res)
    job_salary = re.findall('"providesalary_text":"(.*?)"', res)
    job_place = re.findall('"workarea_text":"(.*?)"', res)

    with open('51job.txt', 'a', encoding='utf-8') as w:
        # BUG FIX: the original computed `index = job_name.index(job_name)`,
        # which is always 0, so every row reused the FIRST company/salary/place
        # on the page. zip() pairs the four field lists positionally and also
        # tolerates unequal list lengths (the original would have raised
        # IndexError had `index` ever been correct and the lists mismatched).
        for job_name, company, salary, place in zip(job_names, job_company, job_salary, job_place):
            w.write(job_name + " " + company + " " + salary + " " + place + " " + "\n")
            num += 1
            if num == 50000:
                done = True
                break
    print(num)
    # BUG FIX: the inner `break` only left the per-page loop; without this
    # guard the script kept requesting pages after hitting the 50,000 cap.
    if done:
        break

json处理数据

import json
import re
from urllib import request

# Fetch the first five result pages for the "sales" keyword and print each
# job's title and salary. The listing data lives inside a <script> tag as a
# JSON object, so it is deserialized with json.loads (json.dumps would
# serialize it back) instead of scraping field-by-field with regexes.
result_pattern = re.compile('__SEARCH_RESULT__ = (.*?)</script>')
for page in range(1, 6):
    page_url = (
        "https://search.51job.com/list/010000,000000,0000,00,9,99,"
        "%25E9%2594%2580%25E5%2594%25AE,2,{}.html?lang=c&postchannel=0000"
        "&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99"
        "&ord_field=0&dibiaoid=0&line=&welfare="
    ).format(page)
    html = request.urlopen(page_url).read().decode('gbk')
    payload = json.loads(result_pattern.findall(html)[0])
    for job in payload['engine_search_result']:
        # An empty salary string means the pay is negotiable.
        if not job['providesalary_text']:
            job['providesalary_text'] = "面议"
        print(job['job_name'], job['providesalary_text'])
Prev:
Crawler-爬虫采集免费的代理
Next:
User-Agent常见