当前位置:网站首页>scrapy保存数据到excel:利用openpyxl创建多张表,设置Excel行数限制
scrapy保存数据到excel:利用openpyxl创建多张表,设置Excel行数限制
2022-06-28 23:21:00 【supramolecular】
在处理excel数据时发现了xlwt的局限性–不能写入超过65535行、256列的数据(因为它只支持Excel 2003及之前的版本,在这些版本的Excel中行数和列数有此限制)如果数据量超过65535就会遇到:ValueError: row index was 65536, not allowed by .xls format,openpyx支持07/10/13版本Excel的,最大行数达到1048576。
使用Excel导出数据,即时保存数据是一个问题,每插入一条数据,先加载,然后保存,操作次数频繁,影响爬虫效率,但是scrapy中途停止时,数据损失小,如果每一张表,数据达到限定的数据量保存,相比前一种方法,爬虫运行效率高,但是需要停止爬虫时,最后一张表的数据没有达到最大数据量时,整个表的数据会丢失,虽然使用了关闭爬虫是保存文件,或异常时保存文件,但是没有效果,后续有时间再继续探索
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient
from scrapy import Item
from .items import QuestionItem, AnswerItem
from openpyxl.workbook import Workbook
from openpyxl import load_workbook
import os
import random
import string
class BaiduZhidaoPipeline(object):
def process_item(self, item, spider):
return item
class MongoDBPipeline(object):
def open_spider(self, spider):
db_uri = spider.settings.get('MONGODB_URI', 'mongodb://localhost:27017')
db_name = spider.settings.get('MONGODB_DB_NAME', 'baiduzhidao')
self.db_client = MongoClient(db_uri)
self.db = self.db_client[db_name]
def close_spider(self, spider):
self.db_client.close()
def process_item(self, item, spider):
self.insert_db(item)
return item
def insert_db(self, item):
if isinstance(item, Item):
if isinstance(item, QuestionItem):
item = dict(item)
self.db['baiduzhidao_question'].insert_one(item)
elif isinstance(item, AnswerItem):
item = dict(item)
self.db['baiduzhidao_answer'].insert_one(item)
class ExcelPipeline(object):
def open_spider(self, spider):
##设置Excel数据条数
self.item_num_max = 65535
self.item_num = 0
self.file = ''
def close_spider(self, spider):
print("Excel数据保存结束--------------------------------------------------")
self.wb.save(self.file)
def process_item(self, item, spider):
try:
if self.item_num == 0:
self.create_excel()
elif self.item_num > self.item_num_max:
## 数据大于最大行数,保存Excel,重新创建
self.wb.save(self.file)
self.create_excel()
self.item_num = 0
# self.load_sheet()
if isinstance(item, Item):
if isinstance(item, QuestionItem):
item = dict(item)
print("Pipline-----------------------", item)
data = [item["question_id"], item["question"], item["question_all"], item["url"], " ", " ", " ",
" ", " ", " ", " ", item["create_time"]]
elif isinstance(item, AnswerItem):
item = dict(item)
print("Pipline-----------------------", item)
data = [item["question_id"], " ", " ", " ", item["answer"], item["evaluate_num"],
item["keyword_nums"], item["best_answer"], item["username"], item["published_time"],
item["answer_id"], item["create_time"]]
self.ws.append(['%s' % i for i in data])
self.item_num += 1
print(self.item_num,"数据导出--------------------------------------------------------------------")
# self.wb.save(self.file)
return item
except:
print("Excel异常结束-------------------------------------------")
self.ws.save(self.file)
return item
def create_excel(self):
self.file = ''.join(random.sample(string.ascii_letters + string.digits, 8)) + ".xlsx"
item = [u'问题id', u'问题', '答案个数', u'链接', u'答案排序', u'答案', u'点赞数', u'关键词出现次数', u'是否是最佳答案', u'用户名称', u'发布时间', u'答案id', u'创建时间']
self.wb = Workbook(write_only=True)
self.ws = self.wb.create_sheet()
self.ws.append(['%s' % i for i in item])
# self.wb.save(self.file)
def load_sheet(self):
self.wb = load_workbook(self.file)
self.ws = self.wb['Sheet']
创建一张表,创建多个工作簿
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient
from scrapy import Item
from .items import QuestionItem, AnswerItem
from openpyxl.workbook import Workbook
from openpyxl import load_workbook
import os
import random
import string
class BaiduZhidaoPipeline(object):
def process_item(self, item, spider):
return item
class MongoDBPipeline(object):
def open_spider(self, spider):
db_uri = spider.settings.get('MONGODB_URI', 'mongodb://localhost:27017')
db_name = spider.settings.get('MONGODB_DB_NAME', 'baiduzhidao')
self.db_client = MongoClient(db_uri)
self.db = self.db_client[db_name]
def close_spider(self, spider):
self.db_client.close()
def process_item(self, item, spider):
self.insert_db(item)
return item
def insert_db(self, item):
if isinstance(item, Item):
if isinstance(item, QuestionItem):
item = dict(item)
self.db['baiduzhidao_question'].insert_one(item)
elif isinstance(item, AnswerItem):
item = dict(item)
self.db['baiduzhidao_answer'].insert_one(item)
class ExcelPipeline(object):
def open_spider(self, spider):
##设置Excel数据条数
self.item_num_max = 20
self.item_num = 0
self.file = ''
def close_spider(self, spider):
print("数据保存结束--------------------------------------------------")
self.wb.save(self.file)
def process_item(self, item, spider):
try:
if self.item_num == 0:
self.create_excel()
elif self.item_num > self.item_num_max:
## 数据大于最大行数,保存Excel,重新创建
self.wb.save(self.file)
self.create_excel()
self.item_num = 0
# self.load_sheet()
if isinstance(item, Item):
if isinstance(item, QuestionItem):
item = dict(item)
print("Pipline-----------------------", item)
data = [item["question_id"], item["question"], item["question_all"], item["url"], " ", " ", " ",
" ", " ", " ", " ", item["create_time"]]
elif isinstance(item, AnswerItem):
item = dict(item)
print("Pipline-----------------------", item)
data = [item["question_id"], " ", " ", " ", item["answer"], item["evaluate_num"],
item["keyword_nums"], item["best_answer"], item["username"], item["published_time"],
item["answer_id"], item["create_time"]]
self.ws.append(['%s' % i for i in data])
self.item_num += 1
print(self.item_num,"数据导出--------------------------------------------------------------------")
# self.wb.save(self.file)
return item
except:
print("结束111111111111111")
self.ws.save(self.file)
return item
def create_excel(self):
self.file = ''.join(random.sample(string.ascii_letters + string.digits, 8)) + ".xlsx"
item = [u'问题id', u'问题', '答案个数', u'链接', u'答案排序', u'答案', u'点赞数', u'关键词出现次数', u'是否是最佳答案', u'用户名称', u'发布时间', u'答案id', u'创建时间']
self.wb = Workbook(write_only=True)
self.ws = self.wb.create_sheet()
self.ws.append(['%s' % i for i in item])
# self.wb.save(self.file)
def load_sheet(self):
self.wb = load_workbook(self.file)
self.ws = self.wb['Sheet']
边栏推荐
- 【狀態機設計】Moore、Mealy狀態機、三段式、二段式、一段式狀態機書寫規範
- CS5463代码模块解析(包含下载链接)
- TDD和自动化测试
- 关联线探究,如何连接流程图的两个节点
- [state machine design] Moore, Mealy state machine, three-stage, two-stage and one-stage state machine writing specification
- [数学建模]Matlab非线性规划之fmincon()函数
- 融云通信解决方案 破解企业沟通痛点
- 华为22级专家十年心血终成云原生服务网格进阶实战文档,是真的6
- Add the premise of ganggan
- The Best of Many Worlds_ Dual Mirror Descent for Online Allocation Problems
猜你喜欢
![[stm32 HAL库] RTC和BKP驱动](/img/72/c2c46377d0a2a5a032802640ca0201.png)
[stm32 HAL库] RTC和BKP驱动
![[matlab]函数定义与使用](/img/43/a7970ca8e075151277f7773434f7db.png)
[matlab]函数定义与使用

IDC: Alibaba cloud ranks first in the market share of China's data governance platform in 2021

第四章 存储器管理练习
![LeetCode 324 摆动排序 II[排序 双指针] HERODING的LeetCode之路](/img/41/b8ba8d771b7224eac1cc8c54fe9d29.png)
LeetCode 324 摆动排序 II[排序 双指针] HERODING的LeetCode之路
![[数学建模]Matlab非线性规划之fmincon()函数](/img/fc/46949679859b1369fcc83d0d8b637c.png)
[数学建模]Matlab非线性规划之fmincon()函数

Linq连表查询

One card can sell tens of millions, and the business expansion is faster than that of players: you don't understand the Jianghu of star cards
![[conception de la machine d'état] Moore, Mealy State Machine, Three - stage, Two - stage, one - stage State Machine Writing Specification](/img/48/e29f34aff7cc437bfb574591d54e3d.png)
[conception de la machine d'état] Moore, Mealy State Machine, Three - stage, Two - stage, one - stage State Machine Writing Specification

C interview questions_ 20220627 record
随机推荐
【SSM】报错 Access denied for user ‘WYF‘@‘localhost‘ (using password: YES) 数据的用户名变成了电脑的用户名
Non scientific class! The road of self-study!
Lost in cloud computing
I can't sleep
Thinking about lever
[conception de la machine d'état] Moore, Mealy State Machine, Three - stage, Two - stage, one - stage State Machine Writing Specification
[stm32 HAL库] 串口通信
Is it safe to open a stock account online?
SqlServer复习
Tanghongbin, Yaya live CTO: to truly localize, the product should not have the attribute of "origin"
老家出资,俞敏洪设立两支基金
第四章 存储器管理练习
Didn't find an internship. He summed it up
【状态机设计】Moore、Mealy状态机、三段式、二段式、一段式状态机书写规范
keil工程,程序写多后,RTT不能打印
[flutter issues Series title 71] Mutual Conversion between uint8list and Image in flutter
Online sql to htmltable tool
Is the account opening of Guosheng securities really safe and reliable
That's how he did it!
Prometeus 2.36.0 new features