58同城一线城市房源信息
Published in:2020-10-13 |

爬取58同城一线城市的房源信息,并将结果存入MySQL数据库。

58同城一线城市租房房源信息

租房页面中的数字使用了加密字体,爬取时需要先将其还原;买房(二手房)页面没有字体加密。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import requests
import time
import base64
import pymysql
from lxml import etree
from fontTools.ttLib import TTFont

# Request headers presenting a desktop Chrome User-Agent string.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
}

def get_font(page_url, page_num):
    """Fetch a listing page and dump its embedded font to disk as XML.

    The page carries its font inline as a base64 string that follows the
    marker ``base64,`` and ends at the next single quote. The decoded font
    is written to ``58font.woff`` and then converted to ``58font.xml``.

    Args:
        page_url: URL of the listing page to download.
        page_num: Page number; only used in the progress message.

    Returns:
        The HTML text of the downloaded page.

    Raises:
        IndexError: If the page text contains no ``base64,`` marker.
        requests.exceptions.RequestException: If the request fails or
            exceeds the timeout.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url=page_url, headers=headers, timeout=30)
    html = response.text
    # Pull out the base64-encoded font embedded in the page.
    base64_string = html.split("base64,")[1].split("'")[0].strip()
    # Decode the base64 text into the raw font bytes.
    bin_data = base64.decodebytes(base64_string.encode())
    with open('58font.woff', 'wb') as f:
        f.write(bin_data)
    print('第' + str(page_num) + '次访问网页,字体文件保存成功!')
    # Convert the font to XML; close it afterwards so the underlying file
    # handle is released before the next page overwrites 58font.woff.
    font = TTFont('58font.woff')
    try:
        font.saveXML('58font.xml')
    finally:
        font.close()
    print('已成功将字体文件转换为xml文件!')
    return html


def find_font():
    """Resolve the ten obfuscated code points to the digits they stand for.

    Reads ``./58font.xml``, looks up the glyph name mapped to each code
    point in the font's cmap, and translates that glyph name to a digit.

    Returns:
        A list of digit strings in the same order as the code points.
        A code point whose glyph name is not in the table contributes
        nothing to the list.

    Raises:
        IndexError: If a code point has no ``map`` entry in the XML.
    """
    # 'glyph00001' .. 'glyph00010' stand for the digits '0' .. '9'.
    digit_by_glyph = {'glyph%05d' % (n + 1): str(n) for n in range(10)}
    # The ten code points used by the obfuscation font.
    code_points = ['0x9476', '0x958f', '0x993c', '0x9a4b', '0x9e3a',
                   '0x9ea3', '0x9f64', '0x9f92', '0x9fa4', '0x9fa5']
    font_xml = etree.parse('./58font.xml')
    digits = []
    for code in code_points:
        # First cmap entry whose code matches gives the glyph name.
        query = "//cmap//map[@code='{}']/@name".format(code)
        glyph_name = font_xml.xpath(query)[0]
        if glyph_name in digit_by_glyph:
            digits.append(digit_by_glyph[glyph_name])
    print('已成功找到编码所对应的数字!')
    return digits


def replace_font(num, page_response):
    """Substitute every obfuscated character in the page with its digit.

    Args:
        num: Sequence of ten replacement strings; ``num[k]`` replaces the
            k-th obfuscated character listed below.
        page_response: Page text containing the obfuscated characters.

    Returns:
        The page text with all ten characters replaced.

    Raises:
        IndexError: If ``num`` has fewer than ten entries.
    """
    # Code points, in order: 9476 958F 993C 9A4B 9E3A 9EA3 9F64 9F92 9FA4 9FA5
    obfuscated_chars = ('鑶', '閏', '餼', '驋', '鸺', '麣', '齤', '龒', '龤', '龥')
    text = page_response
    # Replacements are applied one after another, in the listed order.
    for position, char in enumerate(obfuscated_chars):
        text = text.replace(char, num[position])
    print('已成功将所有加密字体替换!')
    return text


def parse_pages(pages):
    """Extract rental listings from one page and store each row in MySQL.

    Args:
        pages: HTML text of a listing page, with the obfuscated digits
            already replaced.

    Raises:
        IndexError: If the page ``<title>`` is missing or does not have
            the expected ``-``-separated form.
    """
    ele = etree.HTML(pages)
    city = ele.xpath('//head/title/text()')[0].split('-')[1][:-4]
    title = ele.xpath('//h2/a/text()')
    rooms = ele.xpath('//div[@class="des"]/p[@class="room"]/text()')
    address = ele.xpath('//div[@class="des"]/p[@class="infor"]/a/text()')
    price = ele.xpath('//div[@class="money"]/b/text()')
    print(len(title))
    num = 0
    # enumerate gives each listing its own position; looking the index up
    # by value would map duplicate titles onto the first occurrence.
    for index, raw_title in enumerate(title):
        try:
            # The listing name normally sits on the second line of the
            # link text; fall back to the whole text, then to '无'.
            title_lines = raw_title.split('\n')
            name = title_lines[1] if len(title_lines) > 1 else raw_title
            name = name.strip() or '无'
            data = [name, rooms[index].replace(' ', ''),
                    # Each listing contributes two address links.
                    address[index * 2] + " " + address[index * 2 + 1],
                    price[index] + '元/月', city]
        except IndexError as e:
            # Incomplete listing: skip it rather than storing the
            # previous row's data a second time.
            print(e)
            continue
        num += 1
        save_to_mysql(data)
        print('第' + str(num) + '条数据爬取完毕,暂停2秒!')
        time.sleep(2)


def create_mysql_table():
    """Create the ``58tc_zufang`` table in the ``spider`` database if absent.

    Raises:
        pymysql.MySQLError: If connecting or executing the statement fails.
    """
    # NOTE(review): connection credentials are hard-coded here.
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spider')
    sql = 'CREATE TABLE IF NOT EXISTS 58tc_zufang (title VARCHAR(100) PRIMARY KEY,rooms VARCHAR(255) NOT NULL, address VARCHAR(100) ,price VARCHAR(255),city VARCHAR(255) )'
    try:
        with db.cursor() as cursor:
            cursor.execute(sql)
    finally:
        # Close the connection even when the statement fails.
        db.close()


def save_to_mysql(data):
    """Insert one rental listing into the ``58tc_zufang`` table.

    A failed insert is printed and rolled back instead of being raised,
    so one bad row does not stop the crawl.

    Args:
        data: Sequence of ``title, rooms, address, price, city``.
    """
    # NOTE(review): connection credentials are hard-coded here.
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spider')
    sql = 'INSERT INTO 58tc_zufang(title,rooms,address,price,city) values(%s, %s, %s, %s,%s)'
    try:
        with db.cursor() as cursor:
            try:
                cursor.execute(sql, (data[0], data[1], data[2], data[3], data[4]))
                db.commit()
            except Exception as e:
                print(e)
                db.rollback()
    finally:
        # Close the connection even if the rollback itself raises.
        db.close()


if __name__ == '__main__':
    # Create the target table first, then crawl the rental pages city by city.
    create_mysql_table()
    print('MySQL表58tc_data创建成功!')
    city_list = [
        'gz', 'zz', 'dg', 'fs', 'sh', 'bj', 'nj', 'dl', 'tj', 'nb',
        'cd', 'wx', 'hz', 'wh', 'sy', 'sz', 'xa', 'cq', 'cs', 'qd',
    ]
    for city in city_list:
        for i in range(10, 25):
            url = 'https://{}.58.com/chuzu/pn{}/'.format(city, i)
            print(url)
            # Order matters: get_font writes the font XML that find_font reads.
            response = get_font(url, i)
            num_list = find_font()
            pro_pages = replace_font(num_list, response)
            parse_pages(pro_pages)
            print('第{}页数据爬取完毕!'.format(i))
    print('所有数据爬取完毕!')

买房(二手房)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# _*_coding:UTF-8 _*_
import requests
import time
import pymysql
from lxml import etree

# Request headers presenting a desktop Chrome User-Agent string.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}

def parse_pages(pages):
    """Extract second-hand listings from one page and store them in MySQL.

    If the page ``<title>`` cannot be parsed, the function prints the
    captcha notice, waits five minutes and returns without storing
    anything. Re-reading the same parsed document cannot succeed, so the
    page is skipped instead.

    Args:
        pages: HTML text of a second-hand listing page.
    """
    ele = etree.HTML(pages)
    try:
        city = ele.xpath('//head/title/text()')[0].split('-')[1][:-4]
    except (IndexError, AttributeError):
        # The message treats an unparseable title as a captcha page.
        # Back off before the caller requests the next page.
        print("出现验证码,5分钟后操作")
        time.sleep(60 * 5)
        return
    title = ele.xpath('//h2/a/text()')
    print(len(title))
    sum_price = ele.xpath("//div[@class='price']/p[@class='sum']/b/text()")
    every_price = ele.xpath("//div[@class='price']/p[@class='unit']/text()")
    num = 0
    # enumerate gives each listing its own position; looking the index up
    # by value would map duplicate titles onto the first occurrence.
    for index, raw_title in enumerate(title):
        try:
            # Keep only the text before the first non-breaking space.
            data = [raw_title.split('\xa0')[0], sum_price[index] + '万', every_price[index], city]
            num += 1
            save_to_mysql(data)
            print('第' + str(num) + '条数据爬取完毕,暂停1.5秒!')
            time.sleep(1.5)
        except Exception as e:
            # Best effort: report the failed row and move on to the next.
            print(e)


def create_mysql_table():
    """Create the ``58tc_xinfang`` table in the ``spider`` database if absent.

    Raises:
        pymysql.MySQLError: If connecting or executing the statement fails.
    """
    # NOTE(review): connection credentials are hard-coded here.
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spider')
    sql = 'CREATE TABLE IF NOT EXISTS 58tc_xinfang (title VARCHAR(100) PRIMARY KEY,sum_price VARCHAR(255), every_price VARCHAR(100) ,city VARCHAR(255) )'
    try:
        with db.cursor() as cursor:
            cursor.execute(sql)
    finally:
        # Close the connection even when the statement fails.
        db.close()


def save_to_mysql(data):
    """Insert one second-hand listing into the ``58tc_xinfang`` table.

    A failed insert is printed and rolled back instead of being raised,
    so one bad row does not stop the crawl.

    Args:
        data: Sequence of ``title, sum_price, every_price, city``.
    """
    # NOTE(review): connection credentials are hard-coded here.
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spider')
    sql = 'INSERT INTO 58tc_xinfang(title,sum_price,every_price,city) values(%s, %s, %s, %s)'
    try:
        with db.cursor() as cursor:
            try:
                cursor.execute(sql, (data[0], data[1], data[2], data[3]))
                db.commit()
            except Exception as e:
                print(e)
                db.rollback()
    finally:
        # Close the connection even if the rollback itself raises.
        db.close()


if __name__ == '__main__':
    # Table creation is left disabled, as in the original script:
    # create_mysql_table()
    # print('MySQL表58tc_xifang创建成功!')
    city_list = ['fs', 'gz', 'zz', 'dg', 'sh', 'bj', 'nj', 'dl', 'tj', 'nb', 'cd', 'wx', 'hz', 'wh', 'sy', 'sz', 'xa',
                 'cq', 'cs', 'qd']
    for city in city_list:
        for i in range(1, 20):
            url = ('https://{}.58.com/ershoufang/pn' + str(i) + '/').format(city)
            print(url)
            # Send the module's User-Agent headers, which were defined but
            # never used, and bound the wait so one stalled request cannot
            # hang the crawl.
            pro_pages = requests.get(url, headers=headers, timeout=30).text
            parse_pages(pro_pages)
            print('第' + str(i) + '页数据爬取完毕!')
    print('所有数据爬取完毕!')
Prev:
58同城一线城市买房信息数据可视化
Next:
进程池和线程池爬取51job