当前位置:网站首页>Crawl the Douban Books Top 250 and import it into a SQLite database (or an Excel spreadsheet)
Crawl the Douban Books Top 250 and import it into a SQLite database (or an Excel spreadsheet)
2022-06-26 18:12:00 【Little fox dreams of going to fairy tale town】
Crawl the Douban Books Top 250 list and import it into a SQLite database (or an Excel spreadsheet)
For source code, please visit https://github.com/zhang020801/douban_bookTop250
1. Program source code
import re # Regular expressions
from bs4 import BeautifulSoup # Extract the data
import urllib.request,urllib.error # Request access to web page , Return to the web page source code
import xlwt # Save data to excel form
import sqlite3 # Save data to sqlist In the database
def main():
    """Crawl the Douban book Top250 listing and store the rows in SQLite."""
    # Listing URL prefix; the start offset of each page is appended to it.
    listing_url = "https://book.douban.com/top250?start="
    records = getData(listing_url)

    # The Excel sink is kept as an alternative:
    #   saveData(records, " Douban studies Top250.xls")
    database_file = "book.db"
    saveData2(records, database_file)
# Regular expressions applied to the string form of one book's <table> element.
#
# "onclick" must be spelled with a plain ASCII "o". The previous patterns used
# a look-alike Greek omicron (U+03BF), so they could never match real HTML.
findlink = re.compile(r'<a href="(.*?)" onclick=".*?" title=".*?">')  # Book link
findtitle = re.compile(r'<a href=".*?" onclick=".*?" title="(.*?)">')  # Book name
findimglink = re.compile(r'<img src="(.*?)" width="90"/>')  # Cover link
# Two groups: (author, translator).
findauthor = re.compile(r'<p class="pl">(.*?) / (.*?) / .*? / .*?/.*?</p>')
findpress = re.compile(r'<p class="pl">.*? / .*? / (.*?) / .*?/.*?</p>')  # Publisher
findtime = re.compile(r'<p class="pl">.*? / .*? / .*? / (.*?) / .*?</p>')  # Publication date
findmoney = re.compile(r'<p class="pl">.*? / .*? / .*? / .*? / (.*?)</p>')  # Price
findscore = re.compile(r'<span class="rating_nums">(.*?)</span>')  # Rating
# Number of raters: captures only the digits in front of the "人评价" label.
# re.S lets ".*?" span the line breaks inside the <span>.
# NOTE(review): the label had been machine-translated to " People comment on ";
# it is restored to the Chinese page text here -- confirm against live markup.
findpeople = re.compile(r'<span class="pl">.*?(\d+)\s*人评价.*?</span>', re.S)
findjieshao = re.compile(r'<span class="inq">(.*?)</span>')  # One-line blurb
# Looser layouts tried when the primary five-field "<p class="pl">" pattern
# does not match a book's info line.
_AUTHOR_FALLBACK = re.compile(r'<p class="pl">(.*?) / .*? / .*?</p>')
_PRESS_FALLBACK = re.compile(r'<p class="pl">.*? / (.*?) / .*? / .*?</p>')
_TIME_FALLBACK = re.compile(r'<p class="pl">.*? / .*? / (.*?) / .*?</p>')
_MONEY_FALLBACK = re.compile(r'<p class="pl">.*? / .*? / .*?/ (.*?)</p>')

# Placeholder stored for any field that cannot be extracted.
_MISSING = " "


def _first_match(patterns, text, default=_MISSING):
    """Return the first hit of the first pattern in *patterns* that matches *text*."""
    for pattern in patterns:
        hits = re.findall(pattern, text)
        if hits:
            return hits[0]
    return default


def _parse_item(item):
    """Turn the HTML of one book entry into a 10-field record.

    Returns None when no title can be found, so the caller can skip tables
    that are not book entries instead of failing on them.
    """
    title = _first_match([findtitle], item, default=None)
    if title is None:
        return None

    score = _first_match([findscore], item)
    link = _first_match([findlink], item)
    imglink = _first_match([findimglink], item)

    # findauthor has two groups, so a primary match is an (author, translator)
    # tuple while the fallback yields a plain string; always store one string.
    author = _first_match([findauthor, _AUTHOR_FALLBACK], item)
    if isinstance(author, tuple):
        author = " / ".join(author)

    press = _first_match([findpress, _PRESS_FALLBACK], item)
    pub_date = _first_match([findtime, _TIME_FALLBACK], item)
    money = _first_match([findmoney, _MONEY_FALLBACK], item)

    # The captured text may still carry "(" and layout whitespace around the
    # count; keep only the digits when there are any.
    raw_people = _first_match([findpeople], item)
    digits = re.search(r"\d+", raw_people)
    people = digits.group() if digits else (raw_people.strip() or _MISSING)

    jieshao = _first_match([findjieshao], item)

    return [title, score, link, imglink, author, press, pub_date, money,
            people, jieshao]


def getData(baseurl, pages=10, page_size=25):
    """Download the listing pages and extract one record per book.

    Args:
        baseurl: URL prefix; the start offset of each page is appended to it.
        pages: Number of listing pages to fetch.
        page_size: Offset step between consecutive pages.

    Returns:
        A list of records, each
        ``[title, score, link, image link, author, press, date, price,
        number of raters, blurb]``. Fields that cannot be extracted hold a
        single space.
    """
    datalist = []
    for page in range(pages):
        html = askURL(baseurl + str(page * page_size))
        if not html:
            # askURL returns "" when the request failed; nothing to parse.
            continue
        soup = BeautifulSoup(html, "html.parser")
        for table in soup.find_all('table', width="100%"):
            record = _parse_item(str(table))
            if record is not None:
                datalist.append(record)
    return datalist
def askURL(url, timeout=10):
    """Fetch *url* and return its body decoded as UTF-8.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The decoded response body, or an empty string when the request fails.
        Failure details are printed rather than raised.
    """
    import socket  # local import: only needed to recognise read timeouts

    # Send a browser User-Agent header with the request.
    head = {
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 80.0.3987.163Safari / 537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            html = response.read().decode('utf-8')
    except (urllib.error.URLError, socket.timeout) as e:
        reported = False
        if hasattr(e, "code"):
            print(e.code)
            reported = True
        if hasattr(e, "reason"):
            print(e.reason)
            reported = True
        if not reported:
            # e.g. a read timeout, which carries neither attribute
            print(e)
    return html
def saveData(datalist, savepath):
    """Write the scraped records to an .xls workbook at *savepath*.

    Args:
        datalist: Sequence of records as produced by getData (10 fields each).
        savepath: Destination file name for the workbook.

    Row 0 holds the column headings; every record follows on its own row.
    All records present are written, however many were scraped.
    """
    print(" Start saving ...")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet(' Douban studies Top250', cell_overwrite_ok=True)
    col = (" Book name "," score "," Book links "," Cover image link "," author / translator "," Press. "," Publication date "," The price is "," Number of evaluators "," Brief introduction ")
    for j, heading in enumerate(col):
        sheet.write(0, j, heading)
    # Iterate over what was actually scraped instead of assuming 250 rows.
    for i, data in enumerate(datalist):
        print(" The first %d strip "%(i+1))
        for j, value in enumerate(data[:len(col)]):
            # The author field may arrive as an (author, translator) tuple;
            # flatten it to text before writing the cell.
            if isinstance(value, (tuple, list)):
                value = " / ".join(str(part) for part in value)
            sheet.write(i + 1, j, value)
    book.save(savepath)
    print(" Save complete ")
# Parameterised insert: values are bound by sqlite3, never spliced into the SQL
# text, so quotes inside titles or blurbs cannot break or alter the statement.
_INSERT_BOOK_SQL = (
    "insert into book250 "
    "(title, score, book_link, Img_link, author, press, time, money, num, jieshao) "
    "values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
)


def _to_db_value(value):
    """Flatten tuple/list fields into one string so sqlite3 can bind them."""
    if isinstance(value, (tuple, list)):
        return " / ".join(str(part) for part in value)
    return value


def saveData2(datalist, dbpath):
    """Insert every record of *datalist* into the book250 table of *dbpath*.

    Args:
        datalist: Sequence of 10-field records in the column order
            title, score, link, image link, author, press, date, price,
            number of raters, blurb.
        dbpath: Path of the SQLite database file.

    The table is created first if it does not exist. All rows are inserted in
    one transaction; if an insert fails nothing is committed.
    """
    init_db(dbpath)
    rows = [tuple(_to_db_value(field) for field in data[:10]) for data in datalist]
    conn = sqlite3.connect(dbpath)
    try:
        conn.executemany(_INSERT_BOOK_SQL, rows)
        conn.commit()
    finally:
        # Release the database file even when an insert raises.
        conn.close()


def init_db(dbpath):
    """Create the book250 table in *dbpath* unless it is already there."""
    # "if not exists" lets the script be run again against an existing database.
    sql = ''' create table if not exists book250 ( id integer primary key autoincrement, title varchar , score numeric , book_link text, Img_link text, author text, press text, time text, money text, num numeric , jieshao text ) '''
    conn = sqlite3.connect(dbpath)
    try:
        conn.execute(sql)
        conn.commit()
    finally:
        conn.close()
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
2. Program output
1) Data imported into an Excel spreadsheet

2) Data imported into a SQLite database

边栏推荐
猜你喜欢

Dos et détails de la méthode d'attaque

Applet setting button sharing function

LeetCode 128最长连续序列

Preparing for the Blue Bridge Cup and ccf-csp

贝叶斯网络详解

RSA encryption and decryption details

Discussion and generation of digital signature and analysis of its advantages

VCD-影音光碟

IDEA收藏代码、快速打开favorites收藏窗口

In and exceptions, count (*) query optimization
随机推荐
Handwritten promise all
Li Kou daily question - day 28 -566 Reshape matrix
A little experience of next (ITER (dataloader))
比较两个对象的大小关系原来可以如此花里胡哨
ZCMU--1367: Data Structure
LM06丨仅用成交量构造抄底摸顶策略的奥秘
如何将应用加入到deviceidle 白名单?
解决pycharm里面每个字母占一格空格的问题
pycharm如何修改多行注释快捷键
Connected to surface test questions
pycharm的plt.show()如何保持不关闭
临时关闭MySQL缓存
gdb安装
I want to know. I am in Zhaoqing. Where can I open an account? Is it safe to open an account online?
软考备战多媒体系统
Number of solutions for knapsack problem
[QNX] Command
判断某个序列是否为栈的弹出序列
transforms.RandomCrop()的输入只能是PIL image 不能是tensor
交叉编译环境出现.so链接文件找不到问题