Practical application of crawlers at work (hands-on)
2022-06-30 19:11:00 【Little Fire Dragon Talks Data】
Estimated reading time: 10 min
Reading suggestion: this article is a code walkthrough. Bookmark it and work through it at your own pace.
Pain point addressed: many readers find crawlers puzzling. Little Fire Dragon explains the basic principles of crawlers in plain language and shows how to implement them with a short piece of code, to help you get started quickly. This article is aimed at crawler beginners.
00
Preface
In the previous article, Little Fire Dragon shared the basic principles of crawlers; for a refresher, see the 『Theory』 installment. In this article, Little Fire Dragon walks you through a simple crawler step by step, code attached, so interested readers can try it out themselves.
01
Crawler background
Recently the five-year loan prime rate (LPR) came down, lowering the cost of a mortgage, so I wanted to compare house prices across Beijing's districts: now (May 2022) vs. before the pandemic (May 2019). The data comes from 58.com (「58同城」).
02
The crawler code
The previous 『Theory』 installment already covered the general crawler workflow, so no repetition here; straight to the code.
A quick note on the code structure: there are four input files and two output files. (The code could live in a single file, but splitting by function keeps the pieces decoupled, so it is broken into several; the "crawler core file" is the one to focus on.)
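Pieced together from the paths referenced in the code (conf/conf.ini, lib, spider, log/log.txt, ./data), the project layout presumably looks something like this; treat it as an inferred sketch rather than the author's exact tree:

```
project/
├── run.py              # entry point
├── conf/
│   └── conf.ini        # UA/IP pools and other frequently changed settings
├── lib/
│   └── GeneralObj.py   # general file read/write helpers
├── spider/
│   └── InfoSpider.py   # crawler core
├── log/
│   └── log.txt         # run log (output)
└── data/               # crawl result files (output)
```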
【Input】
Main run file (run.py): the entry file that kicks off the core program.
# encoding:utf-8
from configparser import ConfigParser, ExtendedInterpolation
import logging
import sys
import os

FILE_PATH = os.path.dirname(os.path.realpath(__file__))
CONF_PATH = FILE_PATH + '/conf/conf.ini'
# Make the modules under this file's directory importable
sys.path.append(FILE_PATH + '/lib')
sys.path.append(FILE_PATH + '/spider')
from InfoSpider import InfoSpider

# Create the logger
def init_logger():
    logger = logging.getLogger()
    logger.setLevel(level=logging.INFO)
    handler = logging.FileHandler("log/log.txt")
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    return logger

# Load the config file
def init_conf(config_path):
    cf = ConfigParser(interpolation=ExtendedInterpolation())
    cf.read(config_path, encoding='utf-8')
    return cf

if __name__ == '__main__':
    logger = init_logger()
    conf = init_conf(CONF_PATH)
    # Crawl the 58.com data
    info_spider = InfoSpider(conf)
    info_spider.getHouseInfo()

Crawler core file (InfoSpider.py): implements the core functions of the crawler.
#encoding:utf-8
import logging
import sys
import os
import urllib
import urllib.request, urllib.parse
import re
import time
import random

FILE_PATH = os.path.dirname(os.path.realpath(__file__))
FILE_PATH_MAIN = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../")
# Make the helper modules under the main directory importable
sys.path.append(FILE_PATH_MAIN + '/lib')
from GeneralObj import GeneralObj

logger = logging.getLogger(__name__)
obj = GeneralObj()

class InfoSpider(object):
    def __init__(self, conf):
        self.conf = conf
        # Load the User-Agent and IP pools from the config
        ua_conf = self.conf.get('spider_58', 'ua')
        self.ua_list = ua_conf.strip().split('\\001')
        ip_conf = self.conf.get('spider_58', 'ip')
        self.ip_list = ip_conf.strip().split(',')

    def getHouseInfo(self):
        begin_time = time.time()
        # Step 1: get the URL of each district
        self.getHouseInfo_area()
        # Step 2: get the business-circle URLs within each district
        self.getHouseInfo_page()
        # Step 3: get the price data of each business circle
        self.getHouseInfo_detail()
        end_time = time.time()
        cost_time = (end_time - begin_time) // 60
        logger.info('cost time(min) : %i' % cost_time)
        print('cost time(min) : %i' % cost_time)

    def getHouseInfo_area(self):
        logger.info('begin1 getHouseInfo_area url')
        print('begin1 getHouseInfo_area url')
        main_url = 'https://www.58.com/fangjiawang/shi-2022-100/'
        headers = {"User-Agent": random.sample(self.ua_list, 1)[0]}
        req = urllib.request.Request(main_url, headers=headers)
        data = urllib.request.urlopen(req).read()
        data = data.decode('utf-8')
        data_filter = re.findall('<ul class="sel-sec" data-v-4571decc>(.*?)</ul>', data)
        data_list = re.findall('<a href="(.*?)"', data_filter[0])[1:]
        obj.recordsToFile(data_list, './data/getHouseInfo_area.txt', type=1, delimiter="\t", mode="w")
        logger.info('finish1 getHouseInfo_area url')
        print('finish1 getHouseInfo_area url')

    def getHouseInfo_page(self):
        logger.info('begin2 getHouseInfo_page url')
        print('begin2 getHouseInfo_page url')
        area_url_list = obj.fileToRecords('./data/getHouseInfo_area.txt', 1)
        area_url_num = len(area_url_list)
        area_url_counts = 1
        for line in area_url_list:
            headers = {"User-Agent": random.sample(self.ua_list, 1)[0]}
            req = urllib.request.Request(line, headers=headers)
            data = urllib.request.urlopen(req).read()
            data = data.decode('utf-8')
            data_filter = re.findall('<ul class="sel-thi" data-v-4571decc>(.*?)</ul>', data)
            data_list = re.findall('<a href="(.*?)"', data_filter[0])[1:]
            obj.recordsToFile(data_list, './data/getHouseInfo_page.txt', type=1, delimiter="\t", mode="a+")
            logger.info('getHouseInfo_page url progress: %i/%i' % (area_url_counts, area_url_num))
            print('getHouseInfo_page url progress: %i/%i' % (area_url_counts, area_url_num))
            time.sleep(2)
            area_url_counts += 1

    def getHouseInfo_detail(self):
        logger.info('begin3 getHouseInfo_detail url')
        print('begin3 getHouseInfo_detail url')
        detail_url_list = obj.fileToRecords('./data/getHouseInfo_page.txt', 1)
        detail_url_num = len(detail_url_list)
        detail_url_counts = 1
        for line in detail_url_list:
            try:
                address2022 = line
                address2019 = line.replace('shi-2022-100', 'shi-2019-100')
                headers = {"User-Agent": random.sample(self.ua_list, 1)[0]}
                req2022 = urllib.request.Request(address2022, headers=headers)
                req2019 = urllib.request.Request(address2019, headers=headers)
                data2022 = urllib.request.urlopen(req2022).read()
                data2019 = urllib.request.urlopen(req2019).read()
                data2022 = data2022.decode('utf-8')
                data2019 = data2019.decode('utf-8')
                area_list = []
                # Extract the 2022 fields
                data_bigarea = re.findall('<div class="m-t mt20" data-v-2a40ba50>2022(.*?)各板块二手房均价</div>', data2022)
                area_list.append(data_bigarea[0])
                data_area = re.findall('2022年(.*?)房价走势图', data2022)
                area_list.append(data_area[0])
                data_price_2022 = re.findall('2022年5月房价</b><span data-v-2a40ba50>(.*?)元/㎡', data2022)
                area_list.append(data_price_2022[0])
                # Extract the 2019 price
                data_price_2019 = re.findall('2019年5月房价</b><span data-v-2a40ba50>(.*?)元/㎡', data2019)
                area_list.append(data_price_2019[0])
                obj.recordsToFile(area_list, './data/getHouseInfo_detail.txt', type=4, delimiter="\t", mode="a+")
                logger.info('getHouseInfo_detail progress: %i/%i content:%s' % (detail_url_counts, detail_url_num, area_list))
                print('getHouseInfo_detail progress: %i/%i content:%s' % (detail_url_counts, detail_url_num, area_list))
                time.sleep(2)
                detail_url_counts += 1
            except Exception:
                logger.info('getHouseInfo_detail progress: %i/%i NO CONTENT:%s' % (detail_url_counts, detail_url_num, line))
                print('getHouseInfo_detail progress: %i/%i NO CONTENT:%s' % (detail_url_counts, detail_url_num, line))
                time.sleep(2)
                detail_url_counts += 1
                continue

The configuration file (conf.ini): holds the variables that change frequently (only the relevant section is shown here).
[spider_58]
ua = Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)\001Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)\001Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1
ip = 125.112.76.113,110.85.124.116

Custom function file (GeneralObj.py): holds the user-defined helper functions.
#encoding:utf-8
import logging
import sys, os
import time
import random

FILE_PATH = os.path.dirname(os.path.realpath(__file__))
# Make the data directory reachable relative to this file
sys.path.append(FILE_PATH + '/../data')

class GeneralObj(object):
    def __init__(self):
        self.logger = logging.getLogger(__name__)

    # Write a list to a local file
    def recordsToFile(self, records, file_name, type=1, delimiter="\t", mode="w"):
        f = open(file_name, mode)
        if type == 1:
            # One plain string per line
            for line in records:
                f.write(line + "\n")
            f.close()
        if type == 2:
            # One delimited record per line
            for line in records:
                f.write(delimiter.join(line) + "\n")
            f.close()
        if type == 3:
            # A raw string, written as-is
            f.write(records)
            f.close()
        if type == 4:
            # A single delimited record
            f.write(delimiter.join(records) + "\n")
            f.close()

    # Read a local file into a list
    def fileToRecords(self, file_name, type=1, delimiter="\t", mode="r"):
        lists = []
        f = open(file_name, mode)
        if type == 1:
            for line in f.readlines():
                lists.append(line.strip())
            f.close()
            return lists
        if type == 2:
            for line in f.readlines():
                lists.append(line.strip().split(delimiter))
            f.close()
            return lists

【Output】
Log file (log.txt): records the run log, making troubleshooting easier when something goes wrong.
Crawl result files (under data/): hold the crawl results (getHouseInfo_area.txt, getHouseInfo_page.txt, getHouseInfo_detail.txt).
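Every step of InfoSpider.py repeats the same request pattern: pick a random User-Agent from the pool, fetch, decode as UTF-8, then sleep. As a minimal sketch of that pattern (the helper names `build_request` and `fetch` are mine, not the article's):

```python
import random
import time
import urllib.request

def build_request(url, ua_list):
    # Attach a randomly chosen User-Agent, as InfoSpider.py does per request
    headers = {"User-Agent": random.choice(ua_list)}
    return urllib.request.Request(url, headers=headers)

def fetch(url, ua_list, delay=2):
    # Fetch and decode a page, then pause so the target site is not hammered
    data = urllib.request.urlopen(build_request(url, ua_list)).read().decode('utf-8')
    time.sleep(delay)
    return data
```

Factoring this out would remove the three near-identical header/request/decode blocks in the crawler.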
03
Data analysis
Below is the crawled data: the overall figures for Beijing's six urban districts, and the TOP 5 business circles by price increase in each region.
That's all for this issue.
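The "TOP 5 by price increase" ranking can be reproduced from the detail file in a few lines. The sketch below assumes each row of getHouseInfo_detail.txt is tab-separated as (district, business circle, 2022 price, 2019 price), matching the order in which area_list is filled in getHouseInfo_detail, and that the prices are plain numbers; `top_gainers` is my own helper, not the article's code:

```python
def top_gainers(rows, n=5):
    # rows: strings shaped like 'district\tcircle\tprice2022\tprice2019'
    parsed = []
    for row in rows:
        district, circle, p2022, p2019 = row.strip().split('\t')
        change = (float(p2022) - float(p2019)) / float(p2019)
        parsed.append((district, circle, round(change * 100, 1)))  # percent change
    # Largest relative increase first
    return sorted(parsed, key=lambda r: r[2], reverse=True)[:n]
```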