
Web Scraping in Practice (5): Scraping the Douban Top 250

2022-07-06 21:42:00 A-L-Kun


I. Website Analysis

1. Page analysis

Packet-capture analysis shows that the data is not loaded dynamically; the page is static. We can therefore send a request to the page directly and read the data out of the HTML.
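As a quick sanity check, here is a minimal sketch (assuming only the requests library) that fetches the first page and looks for a known title in the raw HTML; finding it there confirms the content is server-rendered rather than injected by JavaScript:

import requests

resp = requests.get(
    "https://movie.douban.com/top250?start=0",
    headers={"User-Agent": "Mozilla/5.0"},  # Douban rejects requests without a browser-like UA
)
resp.encoding = "utf-8"
print("肖申克的救赎" in resp.text)  # the top-ranked title; True means the page is static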

2. Source code analysis

With the F12 developer tools we can see where the page data lives: every entry on the page is stored inside an ol tag whose class is grid_view. That class name is unique on the page, so we can use this node to locate our data, then iterate over its li children to extract the content.
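For instance, a minimal sketch (assuming lxml is installed) that anchors on that class instead of the positional XPath used later; the stand-in HTML mirrors the structure of the real list:

from lxml import etree

html = '<ol class="grid_view"><li>item 1</li><li>item 2</li></ol>'  # stand-in; the real page has 25 li per ol
tree = etree.HTML(html)
items = tree.xpath('//ol[@class="grid_view"]/li')  # the unique class makes a reliable anchor
print(len(items))  # -> 2 here, 25 on the real page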

3. Content analysis

"""
1. https://movie.douban.com/top250?start=0
2. https://movie.douban.com/top250?start=25
3. https://movie.douban.com/top250?start=50
n. https://movie.douban.com/top250?start=25*(n-1)
"""
urls = [f'https://movie.douban.com/top250?start={25*(i-1)}' for i in range(1, 11)]  # 250 movies in total, 25 per page

So we can use a for loop, pre-generate the links and consume them like a stack, or crawl the pages recursively by following each page's link to the next, as sketched below.
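For illustration, a minimal sketch of the recursive variant, which is not the approach used below; it reuses the headers dict and imports introduced in the next section, and assumes the pager keeps its current markup (a span with class next holding the relative link to the next page):

def crawl(url):
    headers.update({'user-agent': UserAgent().random})
    resp = requests.get(url, headers=headers)
    resp.encoding = "utf-8"
    tree = etree.HTML(resp.text)
    # ... extract the 25 entries on this page here ...
    nxt = tree.xpath('//span[@class="next"]/a/@href')  # relative link, e.g. "?start=25&filter="
    if nxt:  # absent on the last page, which ends the recursion
        crawl("https://movie.douban.com/top250" + nxt[0])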

II. Writing the Code

1. Build the URL of every page

#!/usr/bin/python3
# -*- coding: UTF-8 -*-
__author__ = "A.L.Kun"
__file__ = "123.py"
__time__ = "2022/7/6 10:19"

import requests, re  # import the request and regex modules
from lxml import etree  # for XPath parsing
from fake_useragent import UserAgent  # for random request headers
import pandas as pd  # pandas, for quick data handling and export

urls = [f'https://movie.douban.com/top250?start={25*(i-1)}' for i in range(10, 0, -1)]  # build all page URLs up front as a global; they are consumed like a stack (pop from the end), so generate them in reverse order

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'movie.douban.com',
    'Pragma': 'no-cache',
    'sec-ch-ua': '"Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'user-agent': UserAgent().random
}

lis_data = []  #  Store the crawled data 

while urls:
    print(urls.pop())  # demo: pops start=0 first, since the list was built in reverse

2. Get the li tags inside the ol

def get_tags(url):
    headers.update({'user-agent': UserAgent().random})  # re-randomize the user-agent on every request
    resp = requests.get(url, headers=headers)  # send the request
    resp.encoding = "utf-8"  # set the encoding
    tree = etree.HTML(resp.text)  # parse the page source with etree
    ol = tree.xpath('//*[@id="content"]/div/div[1]/ol/li')  # get the li tags under the ol
    for li in ol:
        print(li)

get_tags("https://movie.douban.com/top250?start=0")

3. Extract the data

def get_data(li):
    try:
        imgSrc = li.xpath(".//img/@src")[0].replace("webp", "jpg")  # image link
    except IndexError:  # no img node found
        imgSrc = "Image not found"
    title = li.xpath(".//img/@alt")[0]  # title
    detailUrl = li.xpath(".//div[@class='hd']/a/@href")[0]  # detail-page link
    detail = li.xpath(".//div[@class='bd']/p[1]/text()")  # contains director, year, and genre; we only need the year and genre
    time = re.search(r"\d+", detail[1]).group()  # release year
    type_ = " ".join(re.findall(r"[\u4e00-\u9fa5]+", detail[1]))  # genre
    score = li.xpath(".//span[@class='rating_num']/text()")[0]  # rating
    try:
        quote = li.xpath(".//span[@class='inq']/text()")[0]  # the film's one-line quote
    except IndexError:  # some entries have no quote
        quote = "No quote yet!"
    # print(title, imgSrc, detailUrl, time, type_, score, quote)  # inspect the extracted data
    lis_data.append({
        "Title": title,
        "Image link": imgSrc,
        "Detail page link": detailUrl,
        "Year": time,
        "Genre": type_,
        "Score": score,
        "Quote": quote
    })  # collect the results in the prepared container for pandas to process later; you could also write the data to a database here
    

# test run
resp = requests.get("https://movie.douban.com/top250?start=25", headers=headers)
resp.encoding = "utf-8"
tree = etree.HTML(resp.text)  # parse the page source with etree
ol = tree.xpath('//*[@id="content"]/div/div[1]/ol/li')  # get the li tags under the ol
for li in ol:
    get_data(li)
print(lis_data)
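The comment in get_data mentions that the results could also go to a database. Here is a minimal sketch of that alternative using the standard-library sqlite3 module; the database file, table name, and schema are assumptions made for illustration:

import sqlite3

con = sqlite3.connect("douban.db")  # hypothetical database file
con.execute("""CREATE TABLE IF NOT EXISTS top250
               (title TEXT, img TEXT, url TEXT, year TEXT, genre TEXT, score TEXT, quote TEXT)""")
con.executemany(
    "INSERT INTO top250 VALUES (?, ?, ?, ?, ?, ?, ?)",
    [tuple(d.values()) for d in lis_data],  # dict values keep insertion order, matching the schema
)
con.commit()
con.close()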

4. Data cleaning

def parse_data():
    df = pd.DataFrame(lis_data)
    new_df = df.dropna()  # drop rows with missing values
    # chart analysis and similar work could also be done here; omitted, but see the sketch below
    # print(new_df)
    new_df.to_excel("./douban.xlsx", index=False)

parse_data()
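As an example of the chart analysis the comment alludes to, a minimal sketch (assuming matplotlib is installed) that plots the distribution of the collected ratings; the column name matches the dict key used in get_data:

import matplotlib.pyplot as plt

def plot_scores():
    df = pd.DataFrame(lis_data)
    df["Score"].astype(float).plot(kind="hist", bins=10, title="Douban Top 250 score distribution")
    plt.xlabel("Score")
    plt.savefig("scores.png")  # or plt.show()

plot_scores()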

III. Complete Code

#!/usr/bin/python3
# -*- coding: UTF-8 -*-
__author__ = "A.L.Kun"
__file__ = "123.py"
__time__ = "2022/7/6 10:19"

import requests, re  # import the request and regex modules
from lxml import etree  # for XPath parsing
from fake_useragent import UserAgent  # for random request headers
import pandas as pd  # pandas, for quick data handling and export
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)  # getLogger wires up the configured handlers; instantiating Logger directly would not

urls = [f'https://movie.douban.com/top250?start={25*(i-1)}' for i in range(10, 0, -1)]  # build all page URLs up front as a global; they are consumed like a stack (pop from the end), so generate them in reverse order

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'movie.douban.com',
    'Pragma': 'no-cache',
    'sec-ch-ua': '"Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'user-agent': UserAgent().random
}

lis_data = []  #  Store the crawled data 

def get_data(li):
    try:
        imgSrc = li.xpath(".//img/@src")[0].replace("webp", "jpg")  # image link
    except IndexError:  # no img node found
        imgSrc = "Image not found"
    title = li.xpath(".//img/@alt")[0]  # title
    detailUrl = li.xpath(".//div[@class='hd']/a/@href")[0]  # detail-page link
    detail = li.xpath(".//div[@class='bd']/p[1]/text()")  # contains director, year, and genre; we only need the year and genre
    time = re.search(r"\d+", detail[1]).group()  # release year
    type_ = " ".join(re.findall(r"[\u4e00-\u9fa5]+", detail[1]))  # genre
    score = li.xpath(".//span[@class='rating_num']/text()")[0]  # rating
    try:
        quote = li.xpath(".//span[@class='inq']/text()")[0]  # the film's one-line quote
    except IndexError:  # some entries have no quote
        quote = "No quote yet!"
    # print(title, imgSrc, detailUrl, time, type_, score, quote)  # inspect the extracted data
    lis_data.append({
        "Title": title,
        "Image link": imgSrc,
        "Detail page link": detailUrl,
        "Year": time,
        "Genre": type_,
        "Score": score,
        "Quote": quote
    })  # collect the results in the prepared container for pandas to process later; you could also write the data to a database here

def get_tags(url):
    headers.update({'user-agent': UserAgent().random})  # re-randomize the user-agent on every request
    resp = requests.get(url, headers=headers)  # send the request
    resp.encoding = "utf-8"  # set the encoding
    tree = etree.HTML(resp.text)  # parse the page source with etree
    ol = tree.xpath('//*[@id="content"]/div/div[1]/ol/li')  # get the li tags under the ol
    for li in ol:
        get_data(li)  # extract the data
    log.info(f"{url}: data collected")

def parse_data():
    df = pd.DataFrame(lis_data)
    new_df = df.dropna()  # drop rows with missing values
    # chart analysis and similar work could also be done here; omitted
    # print(new_df)
    new_df.to_excel("./douban.xlsx", index=False)

def main():
    while urls:
        get_tags(urls.pop())
    parse_data()

if __name__ == "__main__":
    main()