爬虫 | 豆瓣小组-珠海租房监控器

Posted by Haauleon on September 1, 2021

  目前有找银石雅园的房子的需求,特地写了一个脚本放在服务器上去定时爬取,然后推送到钉钉群。可以根据关键字查找,我主要找的是银石雅园。


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# -*- coding:utf-8 -*-
"""
@Author  :   haauleon
@Contact :   753494552@qq.com
@File    :   douban_zufang.py
@Date    :   2021-08-26 12:00:00
@Function:   豆瓣小组-珠海租房监控器,目前有找银石雅园的房子的需求
"""
import requests
import re
from functools import wraps
import json

Search_Key = '银石雅园'

def traceback_error(func):
    @wraps(func)
    def wraper(self, *args, **kwargs):
        try:
            result = func(self, *args, **kwargs)
        except Exception as e:
            import traceback
            ex_msg = '{exception}'.format(exception=traceback.format_exc())
            print(ex_msg)
            result = ex_msg
        return result
    return wraper


class DingDingNotice:
    '''钉钉发送类'''

    def __init__(self, ding_token=None, atMobiles=None, isAtAll=None):
        # 根据电话@用户
        self.atMobiles = ['1397606xxxx',] if atMobiles==None else atMobiles
        # self.token = 'cbb3b771657ef' if ding_token==None else ding_token
        # 是否@所有人
        self.isAtAll = True if isAtAll==None else isAtAll
        self.token = '1bf33b1503399d9610fxxxxxxxxx'
        self.api = 'https://oapi.dingtalk.com/robot/send?access_token={}'.format(self.token)
        self.headers = {'Content-Type': 'application/json;charset=utf-8'}

    @traceback_error
    def send_msg(self,content):
        msg = {
            'msgtype': 'text',
            'text': {'content': content},
            'at': {'atMobiles': self.atMobiles, 'isAtAll': self.isAtAll}
        }
        data = requests.post(self.api, data=json.dumps(msg), headers=self.headers).json()
        return json.dumps(data)


class DoubanGroup:
    '''豆瓣小组'''

    def __init__(self):
        self.group_url = "https://www.douban.com/group/555279/discussion?start=%s"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
            'cache-control': "no-cache",
            'Cookie': 'xxxxx',
        }

    def get_group_titles(self, search_key):
        '''获取 3 页豆瓣小组-珠海租房的帖子'''
        zufangs = dict()
        for page in range(0, 51, 25):
            resp = requests.request("GET", self.group_url % page, headers=self.headers)

            titles = re.compile(r'title="(.*?)"',re.S)    
            titles = titles.findall(resp.text)            # 获取单页所有标题

            links = re.compile(r'href="(.*?)"',re.S)      
            links = links.findall(resp.text)              # 获取单页所有链接
            zufang_links = []             
            for j in links:
                if '/group/topic/' in j:
                    '''过滤非豆瓣小组的租房帖子'''
                    zufang_links.append(j)

            zufang = dict(zip(titles, zufang_links))
            zufangs.update(zufang)

        zufang_search = []
        for i,j in zufangs.items():
            if search_key in i:
                '''根据关键字搜索结果'''
                zufang_search.append("%s %s" %(i,j))  
        return zufang_search


def zufang_monitour_run():
    '''抓取豆瓣租房小组帖子脚本执行'''
    douban = DoubanGroup()
    dingding_msg = douban.get_group_titles(Search_Key)
    service = DingDingNotice()
    service.send_msg(
        content='豆瓣租房小组监控提醒!现在{}的租房帖子有{}条,分别是:\n\n'.format(Search_Key, len(dingding_msg)) + 
                '' + 
                '\n'.join(dingding_msg) + 
                '\n\n请及时查看!'
    )


if __name__ == '__main__':
    zufang_monitour_run()