当前位置:网站首页>Web crawler knowledge day04
Web crawler knowledge day04
2022-06-29 03:47:00 【Young Chen Gong】
1. Encapsulating HttpClient
We use HttpClient frequently, so we wrap it in a utility class to make it easier to reuse.
/**
 * Spring-managed helper that wraps Apache HttpClient for the crawler.
 *
 * <p>All requests go through one pooled connection manager and one shared client.
 * Both public methods report failure by returning {@code null} (after printing the
 * stack trace) instead of throwing, so callers must null-check the result.
 */
@Component
public class HttpUtils {

    private final PoolingHttpClientConnectionManager cm;

    // Built once and reused for every request; the client is never closed because
    // closing it would also shut down the shared connection manager.
    private final CloseableHttpClient httpClient;

    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in the pool.
        this.cm.setMaxTotal(200);
        // Maximum number of concurrent connections per host (route).
        this.cm.setDefaultMaxPerRoute(20);
        this.httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
    }

    /**
     * Fetches a URL with GET and returns the response body decoded as UTF-8.
     *
     * @param url the address to request
     * @return the body text; an empty string when a 200 response carries no entity;
     *         {@code null} when the status is not 200, the URL is malformed, or the
     *         request fails
     */
    public String getHtml(String url) {
        try {
            // Created inside the try so a malformed URL yields null like any other failure.
            HttpGet httpGet = new HttpGet(url);
            httpGet.setConfig(this.getConfig());
            // try-with-resources releases the connection back to the pool on every path.
            try (CloseableHttpResponse response = this.httpClient.execute(httpGet)) {
                if (response.getStatusLine().getStatusCode() != 200) {
                    return null;
                }
                // EntityUtils.toString rejects a null entity, so guard it explicitly.
                if (response.getEntity() == null) {
                    return "";
                }
                return EntityUtils.toString(response.getEntity(), "UTF-8");
            }
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Downloads an image with GET and stores it under {@code D:/images/} using a
     * random UUID as the file name plus the extension taken from the URL.
     *
     * @param url the image address
     * @return the generated file name (not the full path), or {@code null} when the
     *         status is not 200, the response has no body, the URL is malformed, or
     *         the download or file write fails
     */
    public String getImage(String url) {
        try {
            HttpGet httpGet = new HttpGet(url);
            httpGet.setConfig(this.getConfig());
            try (CloseableHttpResponse response = this.httpClient.execute(httpGet)) {
                if (response.getStatusLine().getStatusCode() != 200 || response.getEntity() == null) {
                    return null;
                }
                String imageName = UUID.randomUUID().toString() + extensionOf(url);
                // The stream is closed even when writeTo fails, so no file handle leaks.
                try (OutputStream outstream = new FileOutputStream(new File("D:/images/" + imageName))) {
                    response.getEntity().writeTo(outstream);
                }
                return imageName;
            }
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Extracts the file extension (including the leading dot) from the last path
     * segment of a URL, ignoring any query string or fragment.
     *
     * @return the extension such as {@code ".jpg"}, or an empty string when the last
     *         path segment contains no dot
     */
    private static String extensionOf(String url) {
        String path = url;
        int query = path.indexOf('?');
        if (query >= 0) {
            path = path.substring(0, query);
        }
        int fragment = path.indexOf('#');
        if (fragment >= 0) {
            path = path.substring(0, fragment);
        }
        int dot = path.lastIndexOf('.');
        // A dot that precedes the last slash belongs to the host or a directory name.
        return dot > path.lastIndexOf('/') ? path.substring(dot) : "";
    }

    /** Builds the per-request timeout settings (all values in milliseconds). */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000)          // time allowed to establish the connection
                .setConnectionRequestTimeout(500) // time allowed to obtain a connection from the pool
                .setSocketTimeout(10000)          // time allowed to wait for data on the socket
                .build();
    }
}
2. Implementing data capture
Using a scheduled task, we can fetch the latest data at regular intervals.
/**
 * Scheduled crawler task: walks the search result pages, and for every SKU that is
 * not stored yet fetches its title, price and picture and saves it through
 * {@link ItemService}.
 *
 * <p>{@code HttpUtils} returns {@code null} when a request fails, so every fetched
 * page is checked before it is parsed. A failed page or item is skipped; because it
 * is not saved, a later run will try it again.
 */
@Component
public class ItemTask {

    @Autowired
    private HttpUtils httpUtils;

    @Autowired
    private ItemService itemService;

    public static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Crawls the search result pages and stores any new items.
     * The next run starts 100 seconds after the previous run has finished.
     *
     * @throws Exception if a page cannot be parsed or an item cannot be saved
     */
    @Scheduled(fixedDelay = 1000 * 100)
    public void process() throws Exception {
        // Search address; the page number is appended. Pages start at 1 and the next
        // page is reached by adding 2.
        String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&s=5760&click=0&page=";
        for (int i = 1; i < 10; i = i + 2) {
            String html = this.httpUtils.getHtml(url + i);
            if (html == null) {
                // The request failed; skip this page instead of aborting the whole run.
                continue;
            }
            this.parseHtml(html);
        }
        System.out.println(" Execution completed ");
    }

    /**
     * Parses one search result page and saves every SKU that is not stored yet.
     *
     * @param html the page markup, never {@code null}
     * @throws Exception if the price payload is not valid JSON or saving fails
     */
    private void parseHtml(String html) throws Exception {
        Document document = Jsoup.parse(html);
        Elements spus = document.select("div#J_goodsList > ul > li");
        for (Element spuEle : spus) {
            Long spuId = parseId(spuEle.attr("data-spu"));
            if (spuId == null) {
                // No usable SPU id on this entry; skip it rather than fail the run.
                continue;
            }
            Elements skus = spuEle.select("li.ps-item img");
            for (Element skuEle : skus) {
                Long skuId = parseId(skuEle.attr("data-sku"));
                if (skuId == null) {
                    continue;
                }
                if (this.isAlreadySaved(skuId)) {
                    continue;
                }
                this.saveItem(spuId, skuId, skuEle);
            }
        }
    }

    /** Returns whether an item with the given SKU is already in the database. */
    private boolean isAlreadySaved(Long skuId) {
        Item param = new Item();
        param.setSku(skuId);
        List<Item> list = this.itemService.findAll(param);
        return !list.isEmpty();
    }

    /**
     * Fetches title, price and picture for one SKU and saves the item. Returns
     * without saving when the detail page or the price cannot be obtained.
     */
    private void saveItem(Long spuId, Long skuId, Element skuEle) throws Exception {
        Item item = new Item();
        item.setSpu(spuId);
        item.setSku(skuId);
        item.setUrl("https://item.jd.com/" + skuId + ".html");
        item.setCreated(new Date());
        item.setUpdated(item.getCreated());

        // Title comes from the item detail page.
        String itemHtml = this.httpUtils.getHtml(item.getUrl());
        if (itemHtml == null) {
            return;
        }
        String title = Jsoup.parse(itemHtml).select("div.sku-name").text();
        item.setTitle(title);

        // Price comes from a separate JSON endpoint.
        String priceUrl = "https://p.3.cn/prices/mgets?skuIds=J_" + skuId;
        String priceJson = this.httpUtils.getHtml(priceUrl);
        if (priceJson == null || priceJson.isEmpty()) {
            return;
        }
        // path(...) yields a "missing" node instead of null, and asDouble(NaN) reports
        // an absent or non-numeric price as NaN instead of throwing.
        double price = MAPPER.readTree(priceJson).path(0).path("p").asDouble(Double.NaN);
        if (Double.isNaN(price)) {
            return;
        }
        item.setPrice(price);

        // Picture: swap the "/n9/" path segment for "/n1/" before downloading.
        String lazyImg = skuEle.attr("data-lazy-img");
        String picName = null;
        if (!lazyImg.isEmpty()) {
            String pic = "https:" + lazyImg.replace("/n9/", "/n1/");
            System.out.println(pic);
            // getImage returns null when the download fails; the item is still saved.
            picName = this.httpUtils.getImage(pic);
        }
        item.setPic(picName);

        this.itemService.save(item);
    }

    /**
     * Parses a numeric id attribute.
     *
     * @return the id, or {@code null} when the attribute is empty or not a number
     */
    private static Long parseId(String raw) {
        if (raw == null) {
            return null;
        }
        String trimmed = raw.trim();
        if (trimmed.isEmpty()) {
            return null;
        }
        try {
            return Long.valueOf(trimmed);
        } catch (NumberFormatException e) {
            return null;
        }
    }
}
边栏推荐
猜你喜欢

SQL performance optimization is really eye popping

19.03 vessel description and simple application examples continued
![二叉树的锯齿形层序遍历[分层遍历方式之一 -> 前序遍历+level]](/img/f6/0df9f2a454cea0a95a5347546a90fb.png)
二叉树的锯齿形层序遍历[分层遍历方式之一 -> 前序遍历+level]

88.(cesium篇)cesium聚合图

【TcaplusDB知识库】TcaplusDB数据导入介绍

【世界海洋日】TcaplusDB号召你一同保护海洋生物多样性

87.(cesium篇)cesium热力图(贴地形)

Grafana Getting Started tutorial

leetcode:304. 二维区域和检索 - 矩阵不可变

87. (cesium chapter) cesium thermal map (pasted with terrain)
随机推荐
go实现分布式锁
《运营之光3.0》全新上市——跨越时代,自我颠覆的诚意之作
Django model generates docx database design documents
微秒级 TCP 时间戳
[MCU framework][dfu] DFU upgrade example with CRC verification + timeout mechanism +led indicator + chip locking + chip self erasure
Data collection and management [10]
使用roslaunch为Gazebo加载自定义模型时黑屏、报错问题
高性能限流器 Guava RateLimiter
19.03 vessel description and simple application examples continued
迅为i.MX8M开发板yocto系统使用Gstarwmr视频转换
【TcaplusDB知识库】批量复制游戏区
【世界海洋日】TcaplusDB号召你一同保护海洋生物多样性
Do you feel confused when you study at three in the morning?
[tcaplusdb knowledge base] view tcapdir directory server
Vscode plug-in used now
DevOps笔记-05:IT行业中BA、SM、PO、PM、PD、Dev、Ops、QA都是什么角色
Data collection and management [5]
人大金仓(KingBase)导出表结构
Data collection and management [7]
Black screen and error reporting when loading custom models for gazebo with roslaunch