
Crawler Practice (6): Scraping Novels from Biquge

2022-07-07 01:39:00 A-L-Kun


I. Website Analysis

1. Page Analysis

Packet-capture analysis shows that this is a static website: all of the information lives in the page source, so we can extract everything directly from the HTML.

On a book's catalogue page, all of the chapter links are stored inside a single <dl> tag.

On the search page, matches are rendered as a list, so we can tell whether anything was found by checking the list's length. If there are no results, the list is not rendered at all, which means the presence of <div class="novelslistss"></div> tells us whether the search succeeded.
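A minimal sketch of that check, assuming (per the analysis above) that the container only appears when there are results; resp_text holds the search page's HTML, and the helper name is my own:

import re

def search_succeeded(resp_text):
    # The results container only renders when there are matches,
    # so its presence is a reliable success marker.
    return re.search(r'<div class="novelslistss">', resp_text) is not None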

2. Source Code Analysis

Analyzing the source further, we find that the catalogue is wrapped in <dl></dl>, and each chapter's text is wrapped in a <div id="content"></div>.

# A few searches yield URLs like these:
# https://www.bqg.org/modules/article/search.php?searchkey=%BE%F8%CA%C0%CE%E4%C9%F1  # 绝世武神 (Peerless Martial God)
# https://www.bqg.org/modules/article/search.php?searchkey=%BE%F8%CA%C0%CC%C6%C3%C5  # 绝世唐门 (Peerless Tang Clan)
url = "https://www.bqg.org/modules/article/search.php?searchkey=%s"
from urllib import parse
print(parse.quote("绝世武神".encode("gbk")))
# After some experimentation we find that the keyword is first encoded as GBK,
# and the GBK bytes are then URL-encoded.
# So we can mimic the site's search box and look up a book precisely.

name = input("Please enter the book title: ")  # get the title from the user
print(url % parse.quote(name.encode("gbk")))  # build the search URL
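As a sanity check on the GBK hypothesis, we can decode one of the captured searchkey values back into text:

from urllib import parse
# Decoding the percent-encoded GBK bytes should recover the original title
print(parse.unquote("%BE%F8%CA%C0%CE%E4%C9%F1", encoding="gbk"))  # -> 绝世武神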

II. Writing the Code

1. Fetching the Catalogue

# !/usr/bin/python3
# -*- coding: UTF-8 -*-
__author__ = "A.L.Kun"
__file__ = "biquge.py"
__time__ = "2022/7/6 14:03"
# Import the necessary modules
import requests, re  # send requests; use regex to extract content
from urllib import parse  # URL-encode the book title
from fake_useragent import UserAgent  # UA spoofing

# This time we fetch the catalogue of Fengqi Longcheng

url = "https://www.bqg.org/53_53985/"  # the catalogue URL to crawl
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Cookie': 'jieqiVisitId=article_articleviews%3D53982; jieqiVisitTime=jieqiArticlesearchTime%3D1657088500; clickbids=53982',
    'Host': 'www.bqg.org',
    'Pragma': 'no-cache',
    'Referer': 'https://www.bqg.org/',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'user-agent': UserAgent().random
}
def get_menu(url):
    headers.update({'user-agent': UserAgent().random})  # rotate the request header
    resp = requests.get(url, headers=headers)  # send the request
    resp.encoding = resp.apparent_encoding  # the site is GBK-encoded, so let requests detect it
    temp = re.search("正文.*?</dl>", resp.text, re.S).group()  # trim: drop the "latest chapters" block before 正文 (main text)
    lis = re.findall(r'<a href="(.*?)">(.*?)</a>', temp)  # regex out each chapter link and its title
    url_de = []
    chapters = []
    for i in lis:
        url_de.append(f"{url}{i[0]}")  # chapter hrefs are relative, so prepend the catalogue URL
        chapters.append(i[1])
    # print(url_de, chapters)
    return url_de, chapters  # return the chapter URLs and titles

get_menu(url)
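In place of the bare test call above, a slightly more informative check (test lines of my own, not part of the final script) pairs the first few chapter titles with their URLs:

urls, chapters = get_menu(url)
for u, c in list(zip(urls, chapters))[:3]:
    print(c, "->", u)  # chapter title and its full URL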

2. Fetching Chapter Content

def get_content(url):
    headers.update({'user-agent': UserAgent().random})  # rotate the request header
    resp = requests.get(url, headers=headers)  # fetch the chapter page
    resp.encoding = resp.apparent_encoding  # decode as GBK
    content = re.search('<div id="content" name="content">(?P<content>.*?)</div>', resp.text, re.S).group("content")  # extract the chapter body
    content = re.sub(r'<br.*?>', "\n", content)  # clean up: turn <br> tags into newlines
    content = re.sub(r'&nbsp;', " ", content)  # clean up: replace non-breaking space entities
    return content  # return the cleaned text


get_content("https://www.bqg.org/53_53985/38645272.html")  # quick test

3. Downloading the Data

def save_data(url_):
    with open("./Fengqi Longcheng.txt", "w", encoding="utf-8") as f:
        urls, menus = get_menu(url_)
        for index, url in enumerate(urls):
            f.write(f"==============={menus[index]}===============\n")
            print(f"Downloading: {menus[index]}")
            content = get_content(url)
            f.write(content + "\n")
            print(f"{menus[index]} downloaded")
        
save_data("https://www.bqg.org/53_53985/")
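Since the site rate-limits requests (searches, as we will see below, must be 30 seconds apart), it can be worth pausing between chapter downloads. A minimal variant with a delay; the time.sleep call and the function name are my own additions, not part of the original script:

import time

def save_data_slowly(url_, delay=1.0):
    # Same loop as save_data, but sleep between chapters to be polite
    with open("./Fengqi Longcheng.txt", "w", encoding="utf-8") as f:
        urls, menus = get_menu(url_)
        for index, url in enumerate(urls):
            f.write(f"==============={menus[index]}===============\n")
            f.write(get_content(url) + "\n")
            time.sleep(delay)  # throttle requests so we don't hammer the server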
url = "https://www.bqg.org/modules/article/search.php?searchkey=%s"
from lxml import etree
from prettytable import PrettyTable


def have_content(text):
    # When results are found, display them so the user can choose one
    html = etree.HTML(text)
    li_s = html.xpath("//*[@id='main']/div[1]/li")
    table = PrettyTable(['No.', 'Type', 'Title', 'Author'])
    lis_ = []
    index = 0
    for index, li in enumerate(li_s):
        type_ = li.xpath("./span[@class='s1']/text()")[0]
        name = li.xpath("./span[@class='s2']/a/text()")[0]
        url = li.xpath("./span[@class='s2']/a/@href")[0]
        lis_.append(url)
        author = li.xpath("./span[@class='s4']/text()")[0]
        table.add_row([index + 1, type_, name, author])

    print(table)
    i = int(input("Enter the number of the book to download: "))
    if 1 <= i <= index + 1:
        return lis_[i - 1]
    print("Please enter a valid number!")
    

def search(n):
    arg = parse.quote(n.encode("gbk"))
    headers.update({'user-agent': UserAgent().random})  # rotate the request header
    resp = requests.get(url % arg, headers=headers)  # send the request
    resp.encoding = resp.apparent_encoding  # decode as GBK
    tet = resp.text  # page HTML
    if re.search("错误原因：对不起，没有搜索.*?文章！", tet):
        print("Sorry, no matching article was found!")
        return
    if re.search("错误原因：对不起，两次搜索.*?秒", tet):
        print("Sorry, searches must be at least 30 seconds apart")
        return
    # Otherwise the search succeeded
    have_content(tet)  # let the user pick the exact book

    
name = input("Enter the name of the novel to download: ")
search(name)

III. Full Code

# !/usr/bin/python3
# -*- coding: UTF-8 -*-
__author__ = "A.L.Kun"
__file__ = "biquge.py"
__time__ = "2022/7/6 14:03"
# Import the necessary modules
import requests, re, sys  # send requests; use regex to extract content
from urllib import parse  # URL-encode the book title
from fake_useragent import UserAgent  # UA spoofing
from lxml import etree  # parse the search results page
from prettytable import PrettyTable  # display the results as a table

url = "https://www.bqg.org/modules/article/search.php?searchkey=%s"  # search URL
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Cookie': 'jieqiVisitId=article_articleviews%3D53982; jieqiVisitTime=jieqiArticlesearchTime%3D1657088500; clickbids=53982',
    'Host': 'www.bqg.org',
    'Pragma': 'no-cache',
    'Referer': 'https://www.bqg.org/',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'user-agent': UserAgent().random
}



def have_content(text):
    # When results are found, display them so the user can choose one
    html = etree.HTML(text)
    li_s = html.xpath("//*[@id='main']/div[1]/li")
    table = PrettyTable(['No.', 'Type', 'Title', 'Author'])
    lis_ = []
    index = 0
    for index, li in enumerate(li_s):
        type_ = li.xpath("./span[@class='s1']/text()")[0]
        name = li.xpath("./span[@class='s2']/a/text()")[0]
        url = li.xpath("./span[@class='s2']/a/@href")[0]
        lis_.append(url)
        author = li.xpath("./span[@class='s4']/text()")[0]
        table.add_row([index + 1, type_, name, author])

    print(table)
    i = int(input("Enter the number of the book to download: "))
    if 1 <= i <= index + 1:
        return lis_[i - 1]
    print("Please enter a valid number!")
    sys.exit(0)
    

def search(n):
    arg = parse.quote(n.encode("gbk"))
    headers.update({'user-agent': UserAgent().random})  # rotate the request header
    resp = requests.get(url % arg, headers=headers)  # send the request
    resp.encoding = resp.apparent_encoding  # decode as GBK
    tet = resp.text  # page HTML
    if re.search("错误原因：对不起，没有搜索.*?文章！", tet):
        print("Sorry, no matching article was found!")
        return
    if re.search("错误原因：对不起，两次搜索.*?秒", tet):
        print("Sorry, searches must be at least 30 seconds apart")
        return
    # Otherwise the search succeeded
    return have_content(tet)  # let the user pick the exact book, and return its URL

def get_menu(url):
    headers.update({'user-agent': UserAgent().random})  # rotate the request header
    resp = requests.get(url, headers=headers)  # send the request
    resp.encoding = resp.apparent_encoding  # the site is GBK-encoded, so let requests detect it
    temp = re.search("正文.*?</dl>", resp.text, re.S).group()  # trim: drop the "latest chapters" block before 正文 (main text)
    lis = re.findall(r'<a href="(.*?)">(.*?)</a>', temp)  # regex out each chapter link and its title
    url_de = []
    chapters = []
    for i in lis:
        url_de.append(f"{url}{i[0]}")  # chapter hrefs are relative, so prepend the catalogue URL
        chapters.append(i[1])
    # print(url_de, chapters)
    return url_de, chapters  # return the chapter URLs and titles


def get_content(url):
    headers.update({'user-agent': UserAgent().random})  # rotate the request header
    resp = requests.get(url, headers=headers)  # fetch the chapter page
    resp.encoding = resp.apparent_encoding  # decode as GBK
    content = re.search('<div id="content" name="content">(?P<content>.*?)</div>', resp.text, re.S).group("content")  # extract the chapter body
    content = re.sub(r'<br.*?>', "\n", content)  # clean up: turn <br> tags into newlines
    content = re.sub(r'&nbsp;', " ", content)  # clean up: replace non-breaking space entities
    return content  # return the cleaned text

def save_data(url_, name):
    with open(f"./{name}.txt", "w", encoding="utf-8") as f:
        urls, menus = get_menu(url_)  # fetch the catalogue
        for index, url in enumerate(urls):  # walk the chapter links
            f.write(f"==============={menus[index]}===============\n")
            print(f"Downloading: {menus[index]}")
            content = get_content(url)  # fetch the chapter text
            f.write(content + "\n")  # write it to the file
            print(f"{menus[index]} downloaded")  # progress message
            print("--------------------------")
    
            
def main():
    name = input("Please enter the name of the novel to download: ")
    book_url = search(name)  # None if the search failed
    if not book_url:
        return
    save_data(book_url, name)
    print("Download complete")
    
if __name__ == "__main__":
    main()