1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
| from multiprocessing import Pool import requests import json import re
# Search URL template; "{}" receives the page number. The keyword segment
# "%25E9%2594%2580%25E5%2594%25AE" is the double-URL-encoded form of "销售".
# The original literal read "cotype=99°reefrom=99": the "&deg" at the start of
# "&degreefrom" had been decoded as the HTML entity for "°", which corrupted
# the cotype value and dropped the degreefrom parameter. Restored here.
_SEARCH_URL = (
    "https://search.51job.com/list/010000,000000,0000,00,9,99,"
    "%25E9%2594%2580%25E5%2594%25AE,2,{}.html"
    "?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
    "&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
)

# The result page assigns its data to a JavaScript variable inside a <script> tag.
_SEARCH_RESULT_RE = re.compile(r'__SEARCH_RESULT__ = (.*?)</script>')

# Seconds to wait for the server; without a timeout a stalled connection
# would block the calling worker indefinitely.
_REQUEST_TIMEOUT = 10


def _extract_jobs(html):
    """Return the ``engine_search_result`` list embedded in *html*, or None if the marker is absent."""
    match = _SEARCH_RESULT_RE.search(html)
    if match is None:
        return None
    return json.loads(match.group(1))['engine_search_result']


def run(page):
    """Crawl search-result pages 1 .. page-1 and print each job's name and salary.

    For every page number the search URL is fetched, the JSON object assigned
    to ``__SEARCH_RESULT__`` in the page is parsed, and one line is printed per
    entry of its ``engine_search_result`` list. A job whose
    ``providesalary_text`` is empty is printed with "面议" instead.

    Args:
        page: Exclusive upper bound of the page numbers to crawl. Values of 1
            or less crawl nothing (only the start banner is printed).

    Returns:
        None. All results go to standard output.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
        ValueError: If the embedded payload is not valid JSON.
        KeyError: If the payload lacks an expected key.
    """
    print("开始爬取")
    for page_no in range(1, page):
        html = requests.get(
            _SEARCH_URL.format(page_no),
            headers={'user-agent': "baiduspider"},
            timeout=_REQUEST_TIMEOUT,
        ).text
        jobs = _extract_jobs(html)
        if jobs is None:
            # A page without the marker used to raise IndexError and abort
            # every remaining page; skip just this one instead.
            print("page {}: no __SEARCH_RESULT__ payload found, skipped".format(page_no))
            continue
        for job in jobs:
            print(job['job_name'], job['providesalary_text'] or "面议")
def _report_failure(error):
    """Print an exception raised by a crawl task.

    Passed to ``apply_async`` as ``error_callback``, so it runs in the parent
    process. Without it, exceptions from workers are discarded because the
    AsyncResult objects are never inspected.
    """
    import sys  # local import: this block must not rely on the file's import list

    print("crawl task failed: {!r}".format(error), file=sys.stderr)


if __name__ == '__main__':
    # Script entry point: fan the crawl out over 10 worker processes.
    # The context manager ensures the workers are torn down even if
    # submitting a task raises.
    with Pool(10) as pool:
        # NOTE(review): run(i) crawls pages 1 .. i-1, so submitting every i in
        # range(100) fetches the low-numbered pages many times over. Confirm
        # whether one page per task was intended before changing this.
        for i in range(100):
            pool.apply_async(run, (i,), error_callback=_report_failure)
        # Stop accepting work, then wait for every queued task to finish.
        pool.close()
        pool.join()
    print("爬取结束")
|