"""Scraper for 58.com (58同城) rental listings.

For each city and page it downloads the listing page, extracts the embedded
obfuscation font (WOFF), decodes the digit substitution via the font's cmap
table, and stores each listing in a local MySQL table (spider.58tc_zufang).
"""
# Standard-library imports
import base64
import time

# Third-party imports
import pymysql
import requests
from fontTools.ttLib import TTFont
from lxml import etree

# Shared request headers: a desktop-Chrome User-Agent so 58.com serves the
# normal listing HTML instead of a bot-detection page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
}
def get_font(page_url, page_num):
    """Fetch one listing page and save its embedded anti-scraping font.

    58.com inlines a base64-encoded WOFF font in the page; digits in the
    HTML are remapped through this font, so it must be saved and dumped to
    XML before the page text can be decoded.

    Parameters:
        page_url: URL of the listing page to download.
        page_num: request counter, used only for progress logging.

    Returns:
        The raw HTML text of the page (digits still obfuscated).

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
        IndexError: if the page contains no "base64," font payload.
    """
    response = requests.get(url=page_url, headers=headers)
    # Fixed: fail fast on an error status instead of trying to parse an
    # error page for a font payload.
    response.raise_for_status()
    # Payload appears as "...base64,<data>'..." inside the page source.
    base64_string = response.text.split("base64,")[1].split("'")[0].strip()
    bin_data = base64.decodebytes(base64_string.encode())
    with open('58font.woff', 'wb') as f:
        f.write(bin_data)
    print('第' + str(page_num) + '次访问网页,字体文件保存成功!')
    # Dump the font tables to XML so find_font() can query the cmap with lxml.
    font = TTFont('58font.woff')
    font.saveXML('58font.xml')
    print('已成功将字体文件转换为xml文件!')
    return response.text
def find_font():
    """Resolve the page's ten obfuscated code points to real digits.

    Reads the cmap table from 58font.xml (written by get_font) and maps each
    known code point to its glyph name; glyph names encode the true digit
    (glyph00001 -> '0' ... glyph00010 -> '9').

    Returns:
        A list of ten digit strings ordered to match the fixed code-point
        list below (the same order replace_font expects).

    Raises:
        IndexError: if a code point is missing from the font's cmap.
        KeyError: if the font uses an unexpected glyph name.
    """
    # glyphNNNNN -> digit lookup table.
    glyph_to_digit = {
        'glyph00001': '0', 'glyph00002': '1', 'glyph00003': '2',
        'glyph00004': '3', 'glyph00005': '4', 'glyph00006': '5',
        'glyph00007': '6', 'glyph00008': '7', 'glyph00009': '8',
        'glyph00010': '9',
    }
    # Code points 58.com substitutes for digits; order must stay in sync
    # with the replacement order in replace_font.
    code_points = ['0x9476', '0x958f', '0x993c', '0x9a4b', '0x9e3a',
                   '0x9ea3', '0x9f64', '0x9f92', '0x9fa4', '0x9fa5']
    font_data = etree.parse('./58font.xml')
    num_list = []
    for code in code_points:
        glyph_name = font_data.xpath("//cmap//map[@code='{}']/@name".format(code))[0]
        # Fixed: direct dict lookup instead of scanning every key; also
        # avoids shadowing the name 'unicode' as the loop variable.
        num_list.append(glyph_to_digit[glyph_name])
    print('已成功找到编码所对应的数字!')
    return num_list
def replace_font(num, page_response): result = page_response.replace('鑶', num[0]).replace('閏', num[1]).replace('餼', num[2]).replace( '驋', num[3]).replace('鸺', num[4]).replace('麣', num[5]).replace('齤', num[6]).replace( '龒', num[7]).replace('龤', num[8]).replace('龥', num[9]) print('已成功将所有加密字体替换!') return result
def parse_pages(pages):
    """Parse one decoded listing page and store each rental row in MySQL.

    Parameters:
        pages: page HTML whose obfuscated digits have already been replaced.

    Side effects:
        Inserts one row per listing via save_to_mysql and sleeps between
        rows to throttle the crawl.
    """
    num = 0
    ele = etree.HTML(pages)
    # Page <title> looks like "...-<城市>租房网..."; drop the 4-char suffix
    # to keep just the city name.
    city = ele.xpath('//head/title/text()')[0].split('-')[1][:-4]
    title = ele.xpath('//h2/a/text()')
    rooms = ele.xpath('//div[@class="des"]/p[@class="room"]/text()')
    address = ele.xpath('//div[@class="des"]/p[@class="infor"]/a/text()')
    price = ele.xpath('//div[@class="money"]/b/text()')
    print(len(title))
    # Fixed: enumerate() instead of title.index(i) — index() returns the
    # FIRST matching position, so duplicate titles repeated the same row.
    for index, item in enumerate(title):
        try:
            if item == '':
                # Fixed: original wrote 'title[index] == "无"', a no-op
                # comparison instead of an assignment.
                item = '无'
            data = [item.split('\n')[1].strip(),
                    rooms[index].replace(' ', ''),
                    # Each listing contributes two address fragments
                    # (district + estate), hence the 2*index addressing.
                    address[index * 2] + " " + address[index * 2 + 1],
                    price[index] + '元/月',
                    city]
            num += 1
        except Exception as e:
            # Fixed: skip this listing on a parse error; the original fell
            # through and saved a stale (or undefined) 'data'.
            print(e)
            continue
        save_to_mysql(data)
        print('第' + str(num) + '条数据爬取完毕,暂停1.5秒!')
        # Fixed: sleep matches the 1.5s announced in the message above
        # (original slept 2s).
        time.sleep(1.5)
def create_mysql_table():
    """Create the 58tc_zufang table in the 'spider' database if absent.

    The title column is the primary key, so a re-crawled duplicate listing
    makes save_to_mysql's INSERT fail instead of inserting twice.

    Raises:
        pymysql.err.OperationalError: if the MySQL server is unreachable
            or the credentials are wrong.
    """
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spider')
    try:
        cursor = db.cursor()
        sql = 'CREATE TABLE IF NOT EXISTS 58tc_zufang (title VARCHAR(100) PRIMARY KEY,rooms VARCHAR(255) NOT NULL, address VARCHAR(100) ,price VARCHAR(255),city VARCHAR(255) )'
        cursor.execute(sql)
    finally:
        # Fixed: the connection leaked if execute() raised.
        db.close()
def save_to_mysql(data):
    """Insert one rental listing row into 58tc_zufang.

    Parameters:
        data: [title, rooms, address, price, city] as built by parse_pages.

    A failed INSERT (e.g. duplicate title, the primary key) is printed and
    rolled back so the crawl keeps going; it is deliberately best-effort.
    """
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spider')
    try:
        cursor = db.cursor()
        sql = 'INSERT INTO 58tc_zufang(title,rooms,address,price,city) values(%s, %s, %s, %s,%s)'
        try:
            cursor.execute(sql, (data[0], data[1], data[2], data[3], data[4]))
            db.commit()
        except Exception as e:
            print(e)
            db.rollback()
    finally:
        # Fixed: close the connection even if an unexpected error escapes
        # the inner handler (original leaked it in that case).
        db.close()
if __name__ == '__main__':
    # Script entry point: prepare the table, then crawl pages 10-24 of the
    # rental listings for every city subdomain below.
    create_mysql_table()
    # Fixed: the message named a table '58tc_data' that does not exist; the
    # table actually created is 58tc_zufang.
    print('MySQL表58tc_zufang创建成功!')
    city_list = ['gz', 'zz', 'dg', 'fs', 'sh', 'bj', 'nj', 'dl', 'tj', 'nb',
                 'cd', 'wx', 'hz', 'wh', 'sy', 'sz', 'xa', 'cq', 'cs', 'qd']
    for city in city_list:
        for page in range(10, 25):
            url = 'https://{}.58.com/chuzu/pn{}/'.format(city, page)
            print(url)
            # Fetch page + font, decode the digit substitution, then parse.
            response = get_font(url, page)
            num_list = find_font()
            pro_pages = replace_font(num_list, response)
            parse_pages(pro_pages)
            print('第' + str(page) + '页数据爬取完毕!')
    print('所有数据爬取完毕!')
|