Python crawls Baidu Wenku text and writes it into a Word document
2022-06-27 19:52:00 【Beidao end Lane】
Introduction
This script only supports crawling Word-type documents from Baidu Wenku, writing the extracted text into a Word document or a plain-text file (.txt). It mainly relies on the Python requests library.
requests is one of the most popular, convenient, and practical HTTP libraries in the Python crawler ecosystem; the standard library's urllib package is also widely used. Beyond request libraries, the Python crawling toolbox includes the parsing libraries lxml and Beautiful Soup, as well as the crawler framework Scrapy.
Request URL
This section shows how to build the request headers and how to crawl the document page by page. In most cases a User-Agent header alone is enough; the full set below simply mimics a real browser more closely.
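As a small sketch of that claim, here is a request built with only a User-Agent header using the standard library's urllib. The URL is a placeholder for illustration, and no network request is actually made:

```python
import urllib.request

# In many cases a User-Agent alone is enough for a server to treat
# the request as coming from a real browser.
ua = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36')

# Placeholder URL -- the Request object is only constructed, not sent.
req = urllib.request.Request('https://wenku.baidu.com/view/example',
                             headers={'User-Agent': ua})

# urllib normalizes header names via str.capitalize()
print(req.get_header('User-agent'))
```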
def get_url(self):
    url = input("Please enter the URL of the Baidu Wenku document to download: ")
    headers = {
        # Content types the browser will accept
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # Compression encodings the browser supports
        'Accept-Encoding': 'gzip, deflate, br',
        # Languages preferred by the client browser
        'Accept-Language': 'zh-CN,zh;q=0.9',
        # Caching directive
        'Cache-Control': 'max-age=0',
        # Reuse the same connection for subsequent requests until one side closes it
        'Connection': 'keep-alive',
        # Domain name of the target server
        'Host': 'wenku.baidu.com',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        # Client identification (like an ID card for the browser)
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
    }
    response = self.session.get(url=url, headers=headers)
    # Pull the embedded JSON array of per-page load URLs out of the page source
    json_data = re.findall('"json":(.*?}])', response.text)[0]
    json_data = json.loads(json_data)
    for index, page_load_urls in enumerate(json_data):
        page_load_url = page_load_urls['pageLoadUrl']
        self.get_data(index, page_load_url)
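The re.findall pattern above extracts a JSON array from the page's inline JavaScript. Here is a sketch of that extraction against a mock page fragment; the HTML below is fabricated for illustration, and the real Wenku markup may differ:

```python
import re
import json

# Fabricated fragment mimicking inline JS that embeds the page-load URLs.
html = '''var pageData = {"json":[{"pageIndex":1,"pageLoadUrl":"https://example.com/page1"},
{"pageIndex":2,"pageLoadUrl":"https://example.com/page2"}]};'''

# Non-greedy match up to the first "}]" recovers just the JSON array.
# re.S lets "." span the line break inside the array.
json_data = json.loads(re.findall('"json":(.*?}])', html, re.S)[0])
for index, page in enumerate(json_data):
    print(index, page['pageLoadUrl'])
```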
Crawl data
Take the server response, extract the document text, and append it to a file. In with open('百度文库.docx', 'a', encoding='utf-8') you can change .docx to .txt to write a plain-text file instead. Note that appending plain text this way does not produce a true Word document (a real .docx is a zipped XML package; a library such as python-docx would be needed for that), and no extra line breaks are inserted while writing.
def get_data(self, index, url):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'wkbjcloudbos.bdimg.com',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
    }
    response = self.session.get(url=url, headers=headers)
    # The response is a JSONP payload with escaped unicode; decode it first
    data = response.content.decode('unicode_escape')
    command = 'wenku_' + str(index + 1)
    json_data = re.findall(command + r"\((.*?}})\)", data)[0]
    json_data = json.loads(json_data)
    result = []
    for i in json_data['body']:
        result.append(i["c"])
    print(''.join(result).replace(' ', '\n'))
    print("")
    with open('百度文库.docx', 'a', encoding='utf-8') as f:
        f.write(''.join(result).replace(' ', '\n'))
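Each page response is a JSONP payload wrapped in wenku_N(...), which the code above unwraps with a regex before parsing. Here is a sketch of that unwrapping on a fabricated payload; its structure is inferred from the parsing code, and real responses may carry more fields:

```python
import re
import json

# Fabricated JSONP payload in the shape the parsing code expects:
# the text runs live under body[i]["c"], and the trailing "}}" before
# the closing parenthesis is what the regex anchors on.
payload = 'wenku_1({"body":[{"c":"Hello "},{"c":"world"}],"style":{"t":{}}})'

index = 0
command = 'wenku_' + str(index + 1)
# Strip the wenku_1( ... ) wrapper, leaving plain JSON.
json_data = json.loads(re.findall(command + r"\((.*?}})\)", payload)[0])
text = ''.join(item["c"] for item in json_data['body'])
print(text.replace(' ', '\n'))
```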
Complete code
import requests
import re
import json


class WenKu():
    def __init__(self):
        self.session = requests.Session()

    def get_url(self):
        url = input("Please enter the URL of the Baidu Wenku document to download: ")
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'wenku.baidu.com',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
        }
        response = self.session.get(url=url, headers=headers)
        json_data = re.findall('"json":(.*?}])', response.text)[0]
        json_data = json.loads(json_data)
        for index, page_load_urls in enumerate(json_data):
            page_load_url = page_load_urls['pageLoadUrl']
            self.get_data(index, page_load_url)

    def get_data(self, index, url):
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'wkbjcloudbos.bdimg.com',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
        }
        response = self.session.get(url=url, headers=headers)
        data = response.content.decode('unicode_escape')
        command = 'wenku_' + str(index + 1)
        json_data = re.findall(command + r"\((.*?}})\)", data)[0]
        json_data = json.loads(json_data)
        result = []
        for i in json_data['body']:
            result.append(i["c"])
        print(''.join(result).replace(' ', '\n'))
        print("")
        with open('百度文库.docx', 'a', encoding='utf-8') as f:
            f.write(''.join(result).replace(' ', '\n'))


if __name__ == '__main__':
    wk = WenKu()
    wk.get_url()