Crawler in Action (6): Scraping Biquge
I. Website Analysis
1. Page Analysis
Packet-capture analysis shows that this is a static website: all of the information is contained in the page source, so we can extract it directly from the HTML.
On a book's table-of-contents page, all of the chapter links turn out to be stored inside a single <dl> tag.
On the search page, results are rendered as a list, so we can tell whether anything was found from the list's length. If nothing is found, the list is not rendered at all, leaving an empty <div class="novelslistss"></div>, which lets us determine whether the search succeeded (see the sketch below).
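As a minimal sketch of that check (it assumes, as the search code later in this post does, that result items are li elements inside the container; the function name is illustrative):

from lxml import etree

def search_succeeded(page_html: str) -> bool:
    # If nothing was found, the novelslistss container stays empty,
    # so no list items are rendered inside it
    html = etree.HTML(page_html)
    items = html.xpath('//div[@class="novelslistss"]//li')
    return len(items) > 0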
2. Source Code Analysis
Inspecting the source confirms this: the chapter catalogue is wrapped in <dl></dl>, and the chapter text is wrapped in <div id="content"></div>.
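These two observations translate directly into the extraction patterns the code below relies on; a minimal sketch:

import re

# Chapter links: every <a href="...">title</a> inside the catalogue <dl>
chapter_re = re.compile(r'<a href="(.*?)">(.*?)</a>')
# Chapter text: everything inside the content div, matching across newlines
content_re = re.compile(r'<div id="content" name="content">(?P<content>.*?)</div>', re.S)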
3. Link Analysis
# A few searches produce URLs like these:
# https://www.bqg.org/modules/article/search.php?searchkey=%BE%F8%CA%C0%CE%E4%C9%F1  # Peerless Martial God (绝世武神)
# https://www.bqg.org/modules/article/search.php?searchkey=%BE%F8%CA%C0%CC%C6%C3%C5  # Peerless Tang Sect (绝世唐门)
url = "https://www.bqg.org/modules/article/search.php?searchkey=%s"

from urllib import parse
print(parse.quote("绝世武神".encode("gbk")))  # -> %BE%F8%CA%C0%CE%E4%C9%F1

# After a few attempts it turns out the keyword is first encoded as GBK bytes
# and then URL-encoded. With that, we can emulate the search box and look up
# a book precisely:
name = input("Enter the title of the book: ")  # book title typed by the user
print(url % parse.quote(name.encode("gbk")))  # the resulting search URL
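As a sanity check, the percent-escapes decode back to the original title with the same codec:

from urllib import parse
print(parse.unquote("%BE%F8%CA%C0%CE%E4%C9%F1", encoding="gbk"))  # -> 绝世武神 (Peerless Martial God)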
II. Writing the Code
1. Getting the Chapter List
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
__author__ = "A.L.Kun"
__file__ = "biquge.py"
__time__ = "2022/7/6 14:03"

# Import the necessary modules
import requests, re  # send requests; extract data with regular expressions
from urllib import parse  # encode the book title
from fake_useragent import UserAgent  # User-Agent spoofing

# This example fetches the chapter list of "Fengqi Longcheng"
url = "https://www.bqg.org/53_53985/"  # the URL to crawl
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Cookie': 'jieqiVisitId=article_articleviews%3D53982; jieqiVisitTime=jieqiArticlesearchTime%3D1657088500; clickbids=53982',
    'Host': 'www.bqg.org',
    'Pragma': 'no-cache',
    'Referer': 'https://www.bqg.org/',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'user-agent': UserAgent().random,
}
def get_menu(url):
    headers.update({'user-agent': UserAgent().random})  # pick a random User-Agent for each request
    resp = requests.get(url, headers=headers)  # send the request (headers must be passed by keyword)
    resp.encoding = resp.apparent_encoding  # the site serves GBK; let requests detect it
    # Trim the page: skip the "latest chapters" block at the top and keep everything
    # from the main-text marker ("正文" on the page) to the closing </dl>
    temp = re.search("正文.*?</dl>", resp.text, re.S).group()
    lis = re.findall(r'<a href="(.*?)">(.*?)</a>', temp)  # (relative link, chapter title) pairs
    url_de = []
    chapters = []
    for i in lis:
        url_de.append(f"{url}{i[0]}")  # the links are relative, so prepend the book URL
        chapters.append(i[1])
    # print(url_de, chapters)
    return url_de, chapters

get_menu(url)
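A quick way to eyeball what get_menu() returns (a throwaway check, assuming the book URL above is still live):

urls, chapters = get_menu(url)
print(len(chapters))  # number of chapters found
print(chapters[:3])   # the first few chapter titles
print(urls[:3])       # and their full URLs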
2. Getting the Chapter Content
def get_content(url):
    headers.update({'user-agent': UserAgent().random})  # pick a random User-Agent
    resp = requests.get(url, headers=headers)  # fetch the chapter page
    resp.encoding = resp.apparent_encoding  # detect the (GBK) encoding
    content = re.search('<div id="content" name="content">(?P<content>.*?)</div>', resp.text, re.S).group("content")  # grab the chapter body
    content = re.sub(r'<br.*?>', "\n", content)  # turn <br> tags into newlines
    content = re.sub(r'&nbsp;', " ", content)  # replace non-breaking-space entities with plain spaces
    return content

print(get_content("https://www.bqg.org/53_53985/38645272.html")[:200])  # quick test: preview one chapter
3. Downloading the Data
def save_data(url_):
    with open("./Fengqi Longcheng.txt", "w", encoding="utf-8") as f:
        urls, menus = get_menu(url_)  # fetch the chapter list
        for index, url in enumerate(urls):
            f.write(f"==============={menus[index]}===============\n")
            print(f"Downloading: {menus[index]}")
            content = get_content(url)
            f.write(content + "\n")
            print(f"{menus[index]} downloaded")

save_data("https://www.bqg.org/53_53985/")
4. The Search Function
url = "https://www.bqg.org/modules/article/search.php?searchkey=%s"
from lxml import etree
from prettytable import PrettyTable
def have_content(text):
# When content is found , Display the found content for users to choose
html = etree.HTML(text)
li_s = html.xpath("//*[@id='main']/div[1]/li")
table = PrettyTable([' Serial number '," type ", ' Title ',' author '])
lis_ = []
index = 0
for index, li in enumerate(li_s):
type_ = li.xpath("./span[@class='s1']/text()")
name = li.xpath("./span[@class='s2']/a/text()")
url = li.xpath("./span[@class='s2']/a/@href")
lis_.append(url)
author = li.xpath("./span[@class='s4']/text()")
table.add_row([index + 1,type_, name, table])
print(table)
i = input(" Please enter the serial number to download :")
if i < index and i > 0:
return lis_[i - 1]
print(" Please input as required !")
def search(n):
    arg = parse.quote(n.encode("gbk"))  # GBK-encode the keyword, then percent-encode it
    headers.update({'user-agent': UserAgent().random})  # pick a random User-Agent
    resp = requests.get(url % arg, headers=headers)  # send the search request
    resp.encoding = resp.apparent_encoding  # detect the encoding
    tet = resp.text
    # The patterns below match the site's Chinese error banners; adjust if the wording differs
    if re.search("错误原因：对不起，没有搜索.*?文章！", tet):  # "no matching article" error
        print("Sorry, no matching article was found!")
        return
    if re.search("错误原因：对不起，两次搜索.*?秒", tet):  # "search throttled" error
        print("Sorry, searches must be at least 30 seconds apart")
        return
    # Anything else means the search succeeded
    have_content(tet)  # let the user pick the exact book

name = input("Enter the name of the novel to download: ")
search(name)
III. Full Code
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
__author__ = "A.L.Kun"
__file__ = "biquge.py"
__time__ = "2022/7/6 14:03"

# Import the necessary modules
import requests, re, sys  # send requests; regex extraction; sys.exit on bad input
from urllib import parse  # encode the book title
from fake_useragent import UserAgent  # User-Agent spoofing
from lxml import etree  # parse the search-results page
from prettytable import PrettyTable  # display search results as a table

url = "https://www.bqg.org/modules/article/search.php?searchkey=%s"  # search URL template
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Cookie': 'jieqiVisitId=article_articleviews%3D53982; jieqiVisitTime=jieqiArticlesearchTime%3D1657088500; clickbids=53982',
    'Host': 'www.bqg.org',
    'Pragma': 'no-cache',
    'Referer': 'https://www.bqg.org/',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'user-agent': UserAgent().random,
}
def have_content(text):
    # Results were found: show them in a table so the user can pick one
    html = etree.HTML(text)
    li_s = html.xpath("//*[@id='main']/div[1]/li")
    table = PrettyTable(["No.", "Type", "Title", "Author"])
    lis_ = []
    for index, li in enumerate(li_s):
        type_ = li.xpath("./span[@class='s1']/text()")[0]   # category
        name = li.xpath("./span[@class='s2']/a/text()")[0]  # book title
        link = li.xpath("./span[@class='s2']/a/@href")[0]   # book URL
        lis_.append(link)
        author = li.xpath("./span[@class='s4']/text()")[0]
        table.add_row([index + 1, type_, name, author])
    print(table)
    i = int(input("Enter the number of the book to download: "))  # input() returns a string, so convert it
    if 1 <= i <= len(lis_):
        return lis_[i - 1]
    print("Please enter a valid number!")
    sys.exit(0)
def search(n):
    arg = parse.quote(n.encode("gbk"))  # GBK-encode the keyword, then percent-encode it
    headers.update({'user-agent': UserAgent().random})  # pick a random User-Agent
    resp = requests.get(url % arg, headers=headers)  # send the search request
    resp.encoding = resp.apparent_encoding  # detect the encoding
    tet = resp.text
    # The patterns below match the site's Chinese error banners; adjust if the wording differs
    if re.search("错误原因：对不起，没有搜索.*?文章！", tet):  # "no matching article" error
        print("Sorry, no matching article was found!")
        return
    if re.search("错误原因：对不起，两次搜索.*?秒", tet):  # "search throttled" error
        print("Sorry, searches must be at least 30 seconds apart")
        return
    # Anything else means the search succeeded
    return have_content(tet)  # let the user pick the exact book and return its URL
def get_menu(url):
    headers.update({'user-agent': UserAgent().random})  # pick a random User-Agent for each request
    resp = requests.get(url, headers=headers)  # send the request (headers must be passed by keyword)
    resp.encoding = resp.apparent_encoding  # the site serves GBK; let requests detect it
    # Trim the page: skip the "latest chapters" block at the top and keep everything
    # from the main-text marker ("正文" on the page) to the closing </dl>
    temp = re.search("正文.*?</dl>", resp.text, re.S).group()
    lis = re.findall(r'<a href="(.*?)">(.*?)</a>', temp)  # (relative link, chapter title) pairs
    url_de = []
    chapters = []
    for i in lis:
        url_de.append(f"{url}{i[0]}")  # the links are relative, so prepend the book URL
        chapters.append(i[1])
    # print(url_de, chapters)
    return url_de, chapters
def get_content(url):
    headers.update({'user-agent': UserAgent().random})  # pick a random User-Agent
    resp = requests.get(url, headers=headers)  # fetch the chapter page
    resp.encoding = resp.apparent_encoding  # detect the (GBK) encoding
    content = re.search('<div id="content" name="content">(?P<content>.*?)</div>', resp.text, re.S).group("content")  # grab the chapter body
    content = re.sub(r'<br.*?>', "\n", content)  # turn <br> tags into newlines
    content = re.sub(r'&nbsp;', " ", content)  # replace non-breaking-space entities with plain spaces
    return content
def save_data(url_, name):
    with open(f"./{name}.txt", "w", encoding="utf-8") as f:
        urls, menus = get_menu(url_)  # fetch the chapter list
        for index, url in enumerate(urls):  # iterate over the chapter links
            f.write(f"==============={menus[index]}===============\n")
            print(f"Downloading: {menus[index]}")
            content = get_content(url)  # fetch the chapter text
            f.write(content + "\n")  # write it to the file
            print(f"{menus[index]} downloaded")  # progress message
            print("--------------------------")
def main():
    name = input("Enter the name of the novel to download: ")
    book_url = search(name)  # returns None if the search failed or was throttled
    if not book_url:
        return
    save_data(book_url, name)
    print("Download complete")

if __name__ == "__main__":
    main()