当前位置:网站首页>Scrapy program learning
Scrapy program learning
2022-06-26 07:05:00 【Rorschach379】
douban.py
scrapy genspider douban movie.douban.com
scrapy genspider taobao www.taobao.com
import scrapy
from scrapy import Selector, Request
from scrapy.http import HtmlResponse
from spider2107.items import MovieItem
class DoubanSpider(scrapy.Spider):
    """Spider that scrapes the Douban Movie Top250 list.

    Yields one ``MovieItem`` per movie with its title, rating score and
    one-line quote, walking all 10 pages (25 entries each).
    """

    name = 'douban'
    allowed_domains = ['movie.douban.com']

    def start_requests(self):
        # The Top250 list is paginated with 25 entries per page; the
        # ``start`` query parameter is the zero-based item offset.
        for page in range(10):
            yield Request(url=f'https://movie.douban.com/top250?start={page * 25}&filter=')

    def parse(self, response: HtmlResponse):
        # ``response`` already supports .css() — wrapping it in
        # Selector(response) was redundant, so query it directly.
        lis = response.css('#content > div > div.article > ol > li')
        for li in lis:  # type: Selector
            movie_item = MovieItem()
            # .get() is the modern spelling of .extract_first(); both
            # return the first match or None when nothing matches.
            movie_item['title'] = li.css('span.title::text').get()
            movie_item['rank'] = li.css('span.rating_num::text').get()
            movie_item['subject'] = li.css('span.inq::text').get()
            yield movie_item
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class MovieItem(scrapy.Item):
    """Container for one movie scraped from the Douban Top250 list."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Movie title (extracted from span.title by the spider).
    title = scrapy.Field()
    # Rating score (extracted from span.rating_num).
    rank = scrapy.Field()
    # One-line quote/tagline (extracted from span.inq).
    subject = scrapy.Field()
settings.py
# Scrapy settings for spider2107 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'spider2107'

SPIDER_MODULES = ['spider2107.spiders']
NEWSPIDER_MODULE = 'spider2107.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent.
# A desktop Chrome UA is used here because the target site blocks the default
# Scrapy UA string.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'

# Obey robots.txt rules
# NOTE(review): douban's robots.txt restricts many paths — confirm the Top250
# pages are not being filtered out by RobotsTxtMiddleware with this enabled.
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Presumably lowered from the default to reduce load on the target site.
CONCURRENT_REQUESTS = 2

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'spider2107.middlewares.Spider2107SpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'spider2107.middlewares.Spider2107DownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Lower number = runs first: items go through the MySQL pipeline (200)
# before the Excel pipeline (300).
ITEM_PIPELINES = {
    'spider2107.pipelines.DbPipeline': 200,
    'spider2107.pipelines.Spider2107Pipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Save method
scrapy crawl douban -o douban.json
scrapy crawl douban -o douban.csv
scrapy crawl douban -o douban.xml
pip freeze (list the installed dependencies)
pipelines.py
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import openpyxl
import pymysql
class DbPipeline:
    """Persist scraped movies into the MySQL table ``hrs.tb_movie``.

    All inserts accumulate in a single transaction that is committed once,
    when the spider closes.
    """

    def __init__(self):
        # NOTE(review): credentials are hard-coded; consider reading them
        # from Scrapy settings via from_crawler() instead.
        self.conn = pymysql.connect(host='localhost', port=3306,
                                    user='root', password='Abc123!!',
                                    database='hrs', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        # Commit the pending transaction, then ALWAYS release resources —
        # the original leaked the connection (and never closed the cursor)
        # if commit() raised.
        try:
            self.conn.commit()
        finally:
            self.cursor.close()
            self.conn.close()

    def process_item(self, item, spider):
        """Insert one movie row; returns the item so later pipelines run."""
        title = item.get('title', '')
        rank = item.get('rank', 0)
        subject = item.get('subject', '')
        # Parameterized query — values are escaped by the driver, not by us.
        self.cursor.execute(
            'insert into tb_movie (title, rating, subject) '
            'values (%s, %s, %s)',
            (title, rank, subject)
        )
        return item
class Spider2107Pipeline:
    """Collect scraped movies into an Excel workbook, one row per movie."""

    def __init__(self):
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = 'Top250'
        # Header row written once, before any item arrives.
        sheet.append((' title ', ' score ', ' The theme '))
        self.wb = workbook
        self.ws = sheet

    def close_spider(self, spider):
        # Flush the whole workbook to disk when the crawl finishes.
        self.wb.save(' Movie data .xlsx')

    def process_item(self, item, spider):
        """Append one row for *item*; missing fields become empty strings."""
        row = tuple(item.get(field, '') for field in ('title', 'rank', 'subject'))
        self.ws.append(row)
        return item
Save method
scrapy crawl douban
边栏推荐
- . Net 20th anniversary! Microsoft sends a document to celebrate
- QTreeWidget And QTableWidget
- Procedure macros in rust
- Fmt Must the result of println (true) be true?
- 【路径规划】基于改进人工势场实现机器人路径规划附matlab代码
- Redis系列——5种常见数据类型day1-3
- Guide to "avoid dismissal during probation period"
- Shell programming - user information management
- 炒股怎么选择证券公司?手机开户安全么?
- i3wm 获取window class
猜你喜欢

ZRaQnHYDAe

Alkynyl crosslinked porphyrin based polyimide materials (ppbpi-h-cr, ppbpi Mn cr.ppbpi Fe Cr); Metalloporphyrin based polyimide (ppbpi Mn, ppbpi FE) supplied by Qiyue

Oracle中计算除法——解决除数为零报错
![Meso tetra (4-bromophenyl) porphyrin (tbpp); 5,10,15,20-tetra (4-methoxy-3-sulfonylphenyl) porphyrin [t (4-mop) ps4] supplied by Qiyue](/img/83/ddbf296ac83f006f31cfd0bbbabe5e.jpg)
Meso tetra (4-bromophenyl) porphyrin (tbpp); 5,10,15,20-tetra (4-methoxy-3-sulfonylphenyl) porphyrin [t (4-mop) ps4] supplied by Qiyue

Massive log collection tool flume

In depth analysis of redis object structure

MySQL基础用法01

ZRaQnHYDAe

LabVIEW Arduino tcp/ip remote smart home system (project part-5)

Ppbpi-h-cr, ppbpimn Cr, ppbpi Fe Cr alkynyl crosslinked porphyrin based polyimide material Qiyue porphyrin reagent
随机推荐
ZRaQnHYDAe
PyTorch搭建CNN-LSTM混合模型实现多变量多步长时间序列预测(负荷预测)
[feature extraction] feature selection of target recognition information based on sparse PCA with Matlab source code
Interviewer: what is the difference between a test plan and a test plan?
INSERT IGNORE 与INSERT INTO的区别
【图像分割】基于最大主曲率实现视网膜眼底图像中的血管提取附matlab代码
炒股怎么选择证券公司?手机开户安全么?
oracle创建带返回值的存储过程并sql执行调用
China imported wine circulation and investment market survey and Future Development Trend Outlook report 2022-2027
What is deadlock
China's audio industry competition trend outlook and future development trend forecast report 2022 Edition
Analysis report on market demand and investment competitiveness of China's cyclohexanone industry (2022 Edition)
I have been testing at Tencent for several years
Professional course - Code question record
专业课-代码题记录
Spark3.3.0源码编译补充篇-抓狂的证书问题
Redis系列——5种常见数据类型day1-3
解决dialog 底部透明的问题
Pytorch builds CNN LSTM hybrid model to realize multivariable and multi step time series forecasting (load forecasting)
MySQL