Added a few small features for scraping apartment data #52

Open · wants to merge 5 commits into base: master
37 changes: 37 additions & 0 deletions README.md
@@ -21,6 +21,7 @@
- Q: How do I reduce the number of concurrent spiders? A: See thread_pool_size in base_spider.py
- Q: Why can't I use xiaoqu_to_chart.py? A: That script currently only supports macOS
- Q: Is there another channel for feedback? A: Report issues in QQ group 635276285.
- Q: How do I import the data into MySQL? A: Run "ershou.py" to generate the csv files, then run "ershou_to_db.py" to import them into MySQL

## Scraping xiaoqu (residential community) price data
- Record format: collection date, district, area name, community name, average listing price, number of listings
@@ -45,6 +46,12 @@ nj: Nanjing,
- python xiaoqu_to_db.py  Follow the prompts to store today's scraped csv data in the database. (By default the data is exported as a single csv file)
- python xiaoqu_to_chart.py  Display the data from the single csv file as a chart.

## Scraping apartment resale and rental data
- ershou_公寓.py  Scrapes resale (ershou) apartment listings
- zufang_整租1室2室.py  Scrapes apartment rental listings
- 公寓数据抓取.py  Scrapes both resale and rental apartment data


## Scraping listed resale-home data
- Fetches listed resale-home price data from Lianjia, in the following format:
- 20180405,浦东,万祥镇,祥安菊苑 3室2厅 258万,258万,祥安菊苑 | 3室2厅 | 126.58平米 | 南 | 毛坯
@@ -71,6 +78,33 @@ nj: Nanjing,
- Rental price data is stored under data/site/zufang/city/date
- New-home price data is stored under data/site/loupan/city/date

## Importing data into the database
- Table-creation script: lianjia_xiaoqu.sql
- Run ershou_to_db.py to import the resale-home data from csv files into the database
- Run zufang_to_db.py to import the rental data from csv files into the database
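The import scripts expect each csv row to match the columns of the `ershou` table. A minimal, hypothetical sketch of the per-row cleaning that ershou_to_db.py performs (the field names mirror its INSERT statement; the sample row is made up):

```python
# Hypothetical sketch of the csv-row cleaning done in ershou_to_db.py.
def parse_ershou_row(line: str) -> dict:
    fields = line.strip().split(",")
    if len(fields) != 9:
        raise ValueError("unexpected field count: " + line)
    date, district, area, xiaoqu, layout, building_space, price, total_price, desc = fields
    return {
        "date": date,
        "district": district,
        "area": area,
        "xiaoqu": xiaoqu,
        "layout": layout,
        "building_space": float(building_space.replace("平米", "")),  # strip the square-meter unit
        "price": int(price.replace("元/平", "")),                     # unit price, yuan per square meter
        "total_price": float(total_price.replace("万", "")),          # total price, in units of 10,000 yuan
        "desc": desc,
    }

row = parse_ershou_row("20240621,西湖,文教,某小区,2室1厅,89.5平米,45000元/平,402万,精装南向")
print(row["building_space"], row["price"], row["total_price"])
```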

## Rent-to-price ratio report
```sql
-- Rent-to-price ratio query for Hangzhou apartments
select a.district, a.area, a.xiaoqu,
    a.building_space as 建筑面积,
    b.min_price*12/(a.min_total_price*10000) as 租售比,
    b.avg_price*12/(a.min_total_price*10000) as 平均租金租售比,
    a.min_total_price as 最低总价,
    b.min_price as 最低租金,
    b.avg_price as 平均租金
from
    (select district, area, xiaoqu, building_space, min(total_price) as min_total_price from ershou
     where date = 20240621
     group by district, area, xiaoqu, building_space) a
inner join
    (select district, area, xiaoqu, building_space, min(price) as min_price, avg(price) as avg_price from zufang
     where date = 20240621
     group by district, area, xiaoqu, building_space) b
    on a.district=b.district and a.area=b.area and a.xiaoqu=b.xiaoqu and a.building_space=b.building_space
-- order by 租售比 desc
order by 平均租金租售比 desc;
```
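The two ratio columns in the query are annualized rent divided by total price (rent is in yuan per month, total price in 万, i.e. units of 10,000 yuan). The same arithmetic in Python, with illustrative numbers:

```python
# Annualized rent-to-price ratio, mirroring the SQL above.
# monthly_rent is in yuan; total_price_wan is in 万 (10,000 yuan). Example values only.
def rent_to_price_ratio(monthly_rent: float, total_price_wan: float) -> float:
    return monthly_rent * 12 / (total_price_wan * 10000)

ratio = rent_to_price_ratio(monthly_rent=4000, total_price_wan=200)
print(f"{ratio:.2%}")  # 4000*12 / 2,000,000 = 2.40%
```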

## Performance
- Scraped 27,000 xiaoqu records across 207 areas of Shanghai in 300 seconds, about 90 records per second.
```
Total cost 29.757128953933716 second
```
@@ -96,6 +130,9 @@
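A quick sanity check of the throughput figure quoted above:

```python
# 2.7万 (27,000) records in 300 seconds works out to 90 records per second.
records = 27000
seconds = 300
rate = records / seconds
print(rate)  # 90.0
```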


### Changelog
- 2024/06/22 Added dedicated scripts for scraping apartment data
- 2024/06/21 Fixed several small bugs in the resale-home spider; added import of resale-home data into the database
- 2024/06/20 Added scraping of layout and floor area; added import of rental data into MySQL
- 2019/06/21 Removed webbrower from requirements.txt
- 2018/11/05 Added tool/download_ershou_image.py for downloading resale-home thumbnails
- 2018/11/01 Added resale-home thumbnail URLs
122 changes: 122 additions & 0 deletions ershou_to_db.py
@@ -0,0 +1,122 @@
#!/usr/bin/env python
# coding=utf-8
# author: peter lu
# This code is for learning and exchange only; do not use it commercially.
# read data from csv, write to database (mysql)

import os
import pymysql
from lib.utility.path import DATA_PATH
from lib.zone.city import *
from lib.utility.date import *
from lib.utility.version import PYTHON_3
from lib.spider.base_spider import SPIDER_NAME
import records

pymysql.install_as_MySQLdb()


def create_prompt_text():
    city_info = list()
    num = 0
    for en_name, ch_name in cities.items():
        num += 1
        city_info.append(en_name)
        city_info.append(": ")
        city_info.append(ch_name)
        if num % 4 == 0:
            city_info.append("\n")
        else:
            city_info.append(", ")
    return 'Which city data do you want to save ?\n' + ''.join(city_info)


if __name__ == '__main__':

    # target database: mysql
    db = records.Database('mysql://root:pass1234@localhost/house_data?charset=utf8&autocommit=True'
                          , pool_size=5
                          , max_overflow=-1
                          , pool_timeout=5
                          )

    # Pick the city; defaults to Hangzhou. For another city, uncomment the line below.
    # city = get_city()
    city = "hz"
    # Prepare the date; scraped data lives in date-specific folders
    date = get_date_string()
    # Build the csv directory path
    # date = "20180331"  # to process data collected on a specific date
    # city = "sh"        # to process data collected for a specific city
    city_ch = get_chinese_city(city)
    csv_dir = "{0}/{1}/ershou/{2}/{3}".format(DATA_PATH, SPIDER_NAME, city, date)

    files = list()
    if not os.path.exists(csv_dir):
        print("{0} does not exist.".format(csv_dir))
        print("Please run 'python ershou.py' first.")
        print("Bye.")
        exit(0)
    else:
        print('OK, start to process ' + get_chinese_city(city))
    for csv in os.listdir(csv_dir):
        data_csv = csv_dir + "/" + csv
        # print(data_csv)
        files.append(data_csv)

    # Clean and import the data
    count = 0
    error_count = 0
    for csv in files:
        with open(csv, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                count += 1
                text = line.strip()
                try:
                    # A well-formed row has 9 fields (8 commas); the spider strips
                    # commas out of names and descriptions before writing the csv
                    if text.count(',') == 8:
                        text_array = text.split(',')
                        date = text_array[0]
                        district = text_array[1]
                        area = text_array[2]
                        xiaoqu = text_array[3]
                        layout = text_array[4]
                        building_space = text_array[5]
                        price = text_array[6]
                        total_price = text_array[7]
                        desc = text_array[8]
                    else:
                        print("Malformed row: {0}".format(text))
                        error_count = error_count + 1
                        continue
                    # floor area: strip the 平米 (square meter) unit
                    building_space = building_space.replace(r"平米", "")
                    building_space = float(building_space)
                    # unit price: strip the 元/平 (yuan per square meter) unit
                    price = price.replace(r"元/平", "")
                    price = int(price)
                    # total price: strip the 万 (10,000 yuan) unit
                    total_price = total_price.replace(r"万", "")
                    total_price = float(total_price)
                    print("count({0}):{1}:{2}:{3}:{4}:{5}:{6}:{7}:{8}:{9}:{10}"
                          .format(count, city_ch, date, district, area, xiaoqu, layout, building_space, price, total_price, desc))
                    # write the row into mysql
                    db.query('INSERT INTO ershou (city,date,district,area,xiaoqu,layout,building_space,price,total_price,`desc`)'
                             'VALUES(:city, :date, :district, :area, :xiaoqu, :layout, :building_space, :price, :total_price, :desc)',
                             city=city_ch, date=date, district=district, area=area, xiaoqu=xiaoqu, layout=layout, building_space=building_space, price=price, total_price=total_price, desc=desc)
                except Exception as e:
                    # log the bad row and keep going instead of aborting the import
                    print(text)
                    print(e)
                    error_count = error_count + 1

    # close the database handle
    db.close()
    print("Processed {0} rows in total, {1} errors.".format(count, error_count))
14 changes: 14 additions & 0 deletions ershou_公寓.py
@@ -0,0 +1,14 @@
#!/usr/bin/env python
# coding=utf-8
# author: Peter Lu
# This code is for learning and exchange only; do not use it commercially.
# Fetches resale-home data for the specified city.
# tag is a filter; multiple conditions can be combined. Without a filter at
# most 100 pages can be crawled, and data beyond page 100 is unreachable.

from lib.spider.ershou_spider import *

if __name__ == "__main__":
    spider = ErShouSpider(SPIDER_NAME)
    spider.tag = "sf2"
    spider.start()
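Setting spider.tag appends the filter to the listing URL path (see get_area_ershou_info in ershou_spider.py). A standalone sketch of that URL construction; the city and area values are illustrative:

```python
# Illustrative: how ErShouSpider builds the listing URL when a tag is set
# (mirrors get_area_ershou_info); "sf2" is the apartment filter used above.
SPIDER_NAME = "lianjia"
city_name, area_name, tag = "hz", "xihu", "sf2"

if tag is not None:
    page = 'http://{0}.{1}.com/ershoufang/{2}/{3}'.format(city_name, SPIDER_NAME, area_name, tag)
else:
    page = 'http://{0}.{1}.com/ershoufang/{2}/'.format(city_name, SPIDER_NAME, area_name)

print(page)  # http://hz.lianjia.com/ershoufang/xihu/sf2
```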

14 changes: 9 additions & 5 deletions lib/item/ershou.py
@@ -6,18 +6,22 @@


 class ErShou(object):
-    def __init__(self, district, area, name, price, desc, pic):
+    def __init__(self, district, area, name, layout, building_space, price, total_price, desc):
         self.district = district
         self.area = area
-        self.price = price
         self.name = name
+        self.layout = layout
+        self.building_space = building_space
+        self.price = price
+        self.total_price = total_price
         self.desc = desc
-        self.pic = pic

     def text(self):
         return self.district + "," + \
                self.area + "," + \
                self.name + "," + \
+               self.layout + "," + \
+               self.building_space + "," + \
                self.price + "," + \
-               self.desc + "," + \
-               self.pic
+               self.total_price + "," + \
+               self.desc
6 changes: 3 additions & 3 deletions lib/spider/base_spider.py
@@ -11,12 +11,12 @@
 import lib.utility.version
 import random

-thread_pool_size = 50
+thread_pool_size = 20

 # Random delay to keep the spider from being banned
 # Set to False if you do not want any delay
 # The delay range can be tuned in random_delay(); because of multithreading, values above 10 are recommended
-RANDOM_DELAY = False
+RANDOM_DELAY = True
 LIANJIA_SPIDER = "lianjia"
 BEIKE_SPIDER = "ke"
 # SPIDER_NAME = LIANJIA_SPIDER
@@ -27,7 +27,7 @@ class BaseSpider(object):
     @staticmethod
     def random_delay():
         if RANDOM_DELAY:
-            time.sleep(random.randint(0, 16))
+            time.sleep(random.randint(0, 50))

     def __init__(self, name):
         self.name = name
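A rough, illustrative estimate of what the new delay settings mean for crawl speed (assuming the page fetch itself is fast relative to the sleep):

```python
# random.randint(0, 50) is uniform over 0..50, so the mean sleep is 25 seconds.
mean_delay = sum(range(0, 51)) / 51
threads = 20  # the new thread_pool_size
requests_per_second = threads / mean_delay
print(mean_delay, requests_per_second)  # 25.0 0.8
```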
Expand Down
50 changes: 33 additions & 17 deletions lib/spider/ershou_spider.py
@@ -18,6 +18,8 @@


 class ErShouSpider(BaseSpider):
+    # filter tag; multiple conditions can be combined
+    tag: str = None
     def collect_area_ershou_data(self, city_name, area_name, fmt="csv"):
         """
         For each area, fetch all resale-home listings in that area
@@ -31,7 +33,7 @@ def collect_area_ershou_data(self, city_name, area_name, fmt="csv"):
         csv_file = self.today_path + "/{0}_{1}.csv".format(district_name, area_name)
         with open(csv_file, "w") as f:
             # start fetching this area's listings
-            ershous = self.get_area_ershou_info(city_name, area_name)
+            ershous = self.get_area_ershou_info(city_name, area_name, self.tag)
             # lock for multithreaded read/write
             if self.mutex.acquire(1):
                 self.total_num += len(ershous)
@@ -44,9 +46,10 @@ def collect_area_ershou_data(self, city_name, area_name, fmt="csv"):
         print("Finish crawl area: " + area_name + ", save data to : " + csv_file)

     @staticmethod
-    def get_area_ershou_info(city_name, area_name):
+    def get_area_ershou_info(city_name, area_name, tag):
         """
         Fetch the resale-home listings of one area of a city by crawling its pages
+        :param tag: filter condition
         :param city_name: city
         :param area_name: area
         :return: list of resale-home records
@@ -59,7 +62,10 @@ def get_area_ershou_info(city_name, area_name):
         chinese_area = chinese_area_dict.get(area_name, "")

         ershou_list = list()
-        page = 'http://{0}.{1}.com/ershoufang/{2}/'.format(city_name, SPIDER_NAME, area_name)
+        if tag is not None:
+            page = 'http://{0}.{1}.com/ershoufang/{2}/{3}'.format(city_name, SPIDER_NAME, area_name, tag)
+        else:
+            page = 'http://{0}.{1}.com/ershoufang/{2}/'.format(city_name, SPIDER_NAME, area_name)
         print(page)  # print the area page URL
         headers = create_headers()
         response = requests.get(page, timeout=10, headers=headers)
@@ -77,7 +83,10 @@ def get_area_ershou_info(city_name, area_name):

         # iterate from the first page through the last
         for num in range(1, total_page + 1):
-            page = 'http://{0}.{1}.com/ershoufang/{2}/pg{3}'.format(city_name, SPIDER_NAME, area_name, num)
+            if tag is not None:
+                page = 'http://{0}.{1}.com/ershoufang/{2}/pg{3}{4}'.format(city_name, SPIDER_NAME, area_name, num, tag)
+            else:
+                page = 'http://{0}.{1}.com/ershoufang/{2}/pg{3}'.format(city_name, SPIDER_NAME, area_name, num)
             print(page)  # print each page URL
             headers = create_headers()
             BaseSpider.random_delay()
@@ -88,26 +97,33 @@ def get_area_ershou_info(city_name, area_name):
             # panels holding the listing info
             house_elements = soup.find_all('li', class_="clear")
             for house_elem in house_elements:
-                price = house_elem.find('div', class_="totalPrice")
-                name = house_elem.find('div', class_='title')
-                desc = house_elem.find('div', class_="houseInfo")
-                pic = house_elem.find('a', class_="img").find('img', class_="lj-lazy")

                 # clean the extracted data
-                price = price.text.strip()
+                ## community name
+                name = house_elem.find('div', class_='positionInfo')
                 name = name.text.replace("\n", "")
+                name = name.replace(",", "")
+                ## layout and floor area
+                houseInfo = house_elem.find('div', class_="houseInfo")
+                houseInfo = houseInfo.text.split()
+                layout = houseInfo[2]
+                building_space = houseInfo[4]
+                ## total price
+                total_price = house_elem.find('div', class_="totalPrice")
+                total_price = total_price.text.replace("\n", "").replace(",", "").strip()
+                ## unit price
+                price = house_elem.find('div', class_="unitPrice")
+                price = price.text.replace("\n", "").replace(",", "").strip()
+                ## description
+                desc = house_elem.find('div', class_='title')
                 desc = desc.text.replace("\n", "").strip()
-                pic = pic.get('data-original').strip()
-                # print(pic)

                 desc = desc.replace(",", "")
                 # save as an object
-                ershou = ErShou(chinese_district, chinese_area, name, price, desc, pic)
+                ershou = ErShou(chinese_district, chinese_area, name, layout, building_space, price, total_price, desc)
                 ershou_list.append(ershou)
         return ershou_list

     def start(self):
-        city = get_city()
+        # city = get_city()
+        city = "hz"
         self.today_path = create_date_path("{0}/ershou".format(SPIDER_NAME), city, self.date_string)

         t1 = time.time()  # start timing
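The layout and floor-area extraction in the new spider code relies on whitespace-splitting houseInfo.text, so the "|" separators become list items. A standalone sketch with a sample listing string in Lianjia's format:

```python
# houseInfo.text looks like "name | layout | area | orientation | finish";
# str.split() keeps the "|" separators as items, so indices 2 and 4 pick
# out the layout and the floor area.
house_info_text = "祥安菊苑 | 3室2厅 | 126.58平米 | 南 | 毛坯"
parts = house_info_text.split()
layout = parts[2]
building_space = parts[4]
print(layout, building_space)  # 3室2厅 126.58平米
```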