当前位置:网站首页>Project summary --2 (basic use of jsoup)
Project summary --2 (basic use of jsoup)
2022-07-03 06:06:00 【Programmer DD】
Crawling province- and city-level data from a web page
1. To build a cascading province/city/district selector, you need to crawl all of the corresponding administrative division data so that the front end can fetch it via AJAX.
Because of a business requirement, users must be able to choose address information, so I looked online for the latest data on provinces, cities and towns. The data source is the following page:
Division codes and urban-rural classification codes used for statistics, 2021
1. Add org.jsoup
jsoup is an HTML parser for Java.
2. Maven dependency
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<!-- 1.9.2 predates the fixes for CVE-2021-37714 (parser can hang on crafted HTML, fixed in 1.14.2)
     and CVE-2022-36033 (fixed in 1.15.3). The crawler below parses remote pages, so use a patched
     release; it only relies on long-standing APIs (connect/get/select/attr/text). -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.15.3</version>
</dependency>
3. Write the entity class used to store the data
RegionEntry.java
package cn.jiangdoc.utils;
import java.util.ArrayList;
import java.util.List;
/**
 * One node of the administrative division tree: a division code, its display name and the
 * divisions directly beneath it.
 *
 * <p>The child list is never {@code null}: a {@code null} passed to the constructor or to
 * {@link #setSub(List)} is stored as an empty list, so callers can always iterate
 * {@link #getSub()} without a null check.
 */
public class RegionEntry {
    private String code;
    private String name;
    /** Child divisions; never {@code null}. */
    private List<RegionEntry> sub = new ArrayList<>();

    /** Creates an entry with no code, no name and an empty child list. */
    public RegionEntry() {
    }

    /**
     * Creates a fully populated entry.
     *
     * @param code division code
     * @param name display name
     * @param sub  child divisions; {@code null} is treated as "no children"
     */
    public RegionEntry(String code, String name, List<RegionEntry> sub) {
        this.code = code;
        this.name = name;
        this.sub = orEmpty(sub);
    }

    /** @return the division code, or {@code null} if none was set */
    public String getCode() {
        return code;
    }

    /** @param code the division code */
    public void setCode(String code) {
        this.code = code;
    }

    /** @return the display name, or {@code null} if none was set */
    public String getName() {
        return name;
    }

    /** @param name the display name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the live, mutable list of child divisions; never {@code null} */
    public List<RegionEntry> getSub() {
        return sub;
    }

    /**
     * Replaces the child list. A non-null list is stored by reference (not copied).
     *
     * @param sub the new child divisions; {@code null} is treated as "no children"
     */
    public void setSub(List<RegionEntry> sub) {
        this.sub = orEmpty(sub);
    }

    /** Returns {@code list} itself, or a fresh mutable empty list when it is {@code null}. */
    private static List<RegionEntry> orEmpty(List<RegionEntry> list) {
        return list != null ? list : new ArrayList<RegionEntry>();
    }
}
4. Write the crawler itself
AddressData.java
package cn.jiangdoc.utils;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawls the administrative division pages reachable from {@link #SITE_URL}
 * (province, city, county, town) and writes the result to a text file as SQL
 * {@code insert} statements.
 *
 * @author jiangdoc
 * @date 2019-3-16
 */
public class AddressData {
    /** Index page listing the provinces; the links on it are followed level by level. */
    public static String SITE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";

    /** File the generated SQL script is written to. */
    private static final String OUTPUT_FILE = "G:\\log\\city.txt";

    /** Timeout applied to every page request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 30000;

    /** Pause after each province has been crawled, in milliseconds. */
    private static final long PROVINCE_PAUSE_MILLIS = 1000L;

    /** Province codes are right-padded with zeros up to this length. */
    private static final int PROVINCE_CODE_LENGTH = 6;

    /**
     * Name of the placeholder "municipal district" city row (市辖区), written with Unicode
     * escapes so the match does not depend on the source-file encoding. The previous literal
     * {@code " Municipal district "} carried leading/trailing spaces and therefore could never
     * equal {@code Element.text()}, which is trimmed.
     * NOTE(review): the Chinese name is assumed from the source site's data - confirm.
     */
    private static final String MUNICIPAL_DISTRICT = "\u5e02\u8f96\u533a";

    private static List<RegionEntry> regions = new ArrayList<RegionEntry>();

    /**
     * Crawls all levels, then writes the generated SQL to {@link #OUTPUT_FILE}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println(" Grab start :" + new Date());
        getProvince();
        String sql = buildSql(regions);
        // try-with-resources closes the stream even when the write fails.
        try (FileOutputStream out = new FileOutputStream(new File(OUTPUT_FILE))) {
            // Region names may be non-ASCII; pin the encoding instead of relying on the
            // platform default charset.
            out.write(sql.getBytes(StandardCharsets.UTF_8));
            out.flush();
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println(" After grabbing :" + new Date());
    }

    /** Renders the crawled tree as one {@code insert} statement per region, parents first. */
    private static String buildSql(List<RegionEntry> provinces) {
        StringBuilder sql = new StringBuilder();
        for (RegionEntry province : provinces) {
            String provinceCode = escape(province.getCode());
            sql.append("insert into sys_province values(null,'").append(provinceCode)
                    .append("', '").append(escape(province.getName())).append("', 1 );\r\n");
            for (RegionEntry city : province.getSub()) {
                String cityCode = escape(city.getCode());
                sql.append("insert into sys_city values(null,'").append(provinceCode)
                        .append("', '").append(cityCode)
                        .append("','").append(escape(city.getName())).append("', 2);\r\n");
                for (RegionEntry county : city.getSub()) {
                    String countyCode = escape(county.getCode());
                    sql.append("insert into sys_county values(null,'").append(provinceCode)
                            .append("', '").append(cityCode)
                            .append("', '").append(countyCode)
                            .append("', '").append(escape(county.getName())).append("', 3 );\r\n");
                    for (RegionEntry town : county.getSub()) {
                        sql.append("insert into sys_town values(null,'").append(provinceCode)
                                .append("', '").append(cityCode)
                                .append("', '").append(countyCode)
                                .append("', '").append(escape(town.getCode()))
                                .append("','").append(escape(town.getName())).append("', 4 );\r\n");
                    }
                }
            }
        }
        return sql.toString();
    }

    /** Doubles single quotes so a scraped value cannot break out of its SQL string literal. */
    private static String escape(String value) {
        return String.valueOf(value).replace("'", "''");
    }

    /** Downloads and parses one page, applying the shared request timeout. */
    private static Document fetch(String url) throws IOException {
        return Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();
    }

    /**
     * Fetches {@code url} and returns the rows matching {@code rowSelector}. A failed request
     * is reported and yields no rows, so one bad page does not abort the whole crawl.
     */
    private static Elements fetchRows(String url, String rowSelector) {
        try {
            return fetch(url).select(rowSelector);
        } catch (IOException e) {
            System.err.println("Failed to fetch " + url);
            e.printStackTrace();
            return new Elements();
        }
    }

    /**
     * Reads code and name from one table row: from its first two links when present,
     * otherwise from its first two cells. Returns {@code null} when the row has neither.
     */
    private static RegionEntry parseRow(Element row) {
        Elements cells = row.select("a");
        if (cells.size() < 2) {
            cells = row.select("td");
        }
        if (cells.size() < 2) {
            return null;
        }
        RegionEntry entry = new RegionEntry();
        entry.setCode(cells.get(0).text());
        entry.setName(cells.get(1).text());
        return entry;
    }

    /** Returns the absolute URL of the row's first link, or an empty string if it has none. */
    private static String childPageUrl(Element row) {
        Elements links = row.select("a");
        return links.isEmpty() ? "" : links.get(0).attr("abs:href");
    }

    /** Right-pads a province code with zeros to {@link #PROVINCE_CODE_LENGTH} characters. */
    private static String padProvinceCode(String code) {
        StringBuilder padded = new StringBuilder(code);
        while (padded.length() < PROVINCE_CODE_LENGTH) {
            padded.append('0');
        }
        return padded.toString();
    }

    /**
     * Crawls the province index page and, for each province, everything beneath it.
     * Results are appended to {@link #regions}.
     */
    private static void getProvince() {
        Document doc;
        try {
            doc = fetch(SITE_URL);
        } catch (IOException e) {
            System.err.println("Failed to fetch " + SITE_URL);
            e.printStackTrace();
            return;
        }
        for (Element link : doc.select("tr.provincetr").select("a")) {
            RegionEntry province = new RegionEntry();
            // The code is the part of the href before the first dot; split() can return an
            // empty array (e.g. for "."), so guard the index.
            String[] parts = link.attr("href").split("\\.");
            province.setCode(padProvinceCode(parts.length == 0 ? "" : parts[0]));
            province.setName(link.text());
            // Absolute URL of the province's city page.
            String absHref = link.attr("abs:href");
            System.out.println(absHref);
            if (!absHref.isEmpty()) {
                getCity(absHref, province);
            }
            regions.add(province);
            try {
                Thread.sleep(PROVINCE_PAUSE_MILLIS);
            } catch (InterruptedException e1) {
                // Restore the flag and stop crawling; main still writes what was collected.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Get the city address
     *
     * @param url    page listing the cities of {@code region}
     * @param region province the parsed cities are added to
     */
    private static void getCity(String url, RegionEntry region) {
        // <tr class='citytr'><td><a href='65/6501.html'>650100000000</a></td><td><a href='65/6501.html'> Urumqi </a></td></tr>
        for (Element row : fetchRows(url, "tr.citytr")) {
            RegionEntry city = parseRow(row);
            if (city == null) {
                continue;
            }
            // The placeholder "municipal district" row takes the name of its province.
            if (MUNICIPAL_DISTRICT.equals(city.getName())) {
                city.setName(region.getName());
            }
            String absHref = childPageUrl(row);
            if (!absHref.isEmpty()) {
                getArea(absHref, city);
            }
            region.getSub().add(city);
        }
    }

    /**
     * Get the District address
     *
     * @param url    page listing the counties/districts of {@code region}
     * @param region city the parsed counties are added to
     */
    private static void getArea(String url, RegionEntry region) {
        //<tr class='countytr'><td><a href='01/130102.html'>130102000000</a></td><td><a href='01/130102.html'> Chang'an District </a></td></tr>
        for (Element row : fetchRows(url, "tr.countytr")) {
            RegionEntry area = parseRow(row);
            if (area == null) {
                continue;
            }
            // Rows without a link have no town page to follow.
            String absHref = childPageUrl(row);
            if (!absHref.isEmpty()) {
                getTown(absHref, area);
            }
            region.getSub().add(area);
        }
    }

    /**
     * Get the township address
     *
     * @param url    page listing the towns of {@code region}
     * @param region county the parsed towns are added to
     */
    private static void getTown(String url, RegionEntry region) {
        //<tr class='towntr'><td><a href='07/110107001.html'>110107001000</a></td><td><a href='07/110107001.html'> Babaoshan sub district office </a></td></tr>
        for (Element row : fetchRows(url, "tr.towntr")) {
            RegionEntry town = parseRow(row);
            if (town != null) {
                region.getSub().add(town);
            }
        }
    }
}
The main problem you may run into is requests timing out while crawling; I suggest making the Thread.sleep interval a little longer,
or adding the request headers shown below. The data may need to be verified afterwards; the generated SQL is roughly 5-6 MB when the basic data is complete.
// Build the request first so headers can be attached before it is executed.
Connection connect = Jsoup.connect("url");
// Map takes two type arguments (header name -> header value); Map<String> does not compile.
Map<String, String> header = new HashMap<String, String>();
header.put("Content-type", "application/json");
Connection data = connect.headers(header);
Document document = data.get();
Back-end API for the province/city data
How the crawler is set up depends on the structure of the data source. On the back end you only need to expose the corresponding endpoints; the main point is how IDs are passed. The front-end JavaScript sends the parent ID, and the back end simply queries by the ID it receives, so the overall business logic is very simple.

1. Entity class design
/**
 * JPA entity mapped to the {@code tb_area} table. Each row is one area with its own
 * {@code id}, a {@code parentId} referring to the area above it, a code, a name and a level.
 * NOTE(review): the class body is cut off in this excerpt; only the fields shown are documented.
 */
@Data
@Entity
@Table(name = "tb_area")
@NoArgsConstructor
@EntityListeners(AuditingEntityListener.class)
public class Area {
// Primary key; no generation strategy is declared here, so the caller presumably assigns it.
@Id
@Column(columnDefinition = "varchar(32) comment ' Primary key id' ")
private String id;
// Id of the parent area.
@Column(columnDefinition = "varchar(32) comment ' Parent id' ")
private String parentId;
// Area code.
@Column(columnDefinition = "varchar(32) comment ' code ' ")
private String code;
// Area name.
@Column(columnDefinition = "varchar(32) comment ' name ' ")
private String name;
// Level ("grade") of the area, stored as a string.
@Column(columnDefinition = "varchar(32) comment ' Grade ' ")
private String level;
// Creation timestamp; annotated for auditing via the AuditingEntityListener declared above.
@Column(columnDefinition = "datetime comment ' Creation time ' ")
@CreatedDate
private LocalDateTime createTime;
// Last-update timestamp; annotated for auditing like createTime.
@Column(columnDefinition = "datetime comment ' Update time ' ")
@LastModifiedDate
private LocalDateTime updateTime;
// Column definition for the soft-delete flag declared on the next line.
@Column(columnDefinition = "tinyint(1) comment ' Whether or not to delete ' ")
private Boolean delFlag = false;
2. DAO (AreaRepository): database persistence layer
/**
 * Spring Data repository for {@code Area} rows keyed by a {@code String} id.
 * All finders below are derived from their method names and only match rows whose
 * {@code delFlag} is false.
 */
public interface AreaRepository
        extends JpaRepository<Area, String>, JpaSpecificationExecutor<Area> {

    /** Finds the areas whose {@code code} equals the given value. */
    List<AreaDTO> findAllByCodeAndDelFlagIsFalse(String code);

    /** Finds the areas whose {@code level} equals the given value. */
    List<AreaDTO> findAllByLevelAndDelFlagIsFalse(String level);

    /** Finds the areas whose {@code parentId} equals the given value. */
    List<AreaDTO> findAllByParentIdAndDelFlagIsFalse(String parentId);
}
3. Service (business layer): each method queries the corresponding DAO directly; there is no complex logic
/**
 * Lookup service for areas. Every method delegates directly to {@code AreaRepository};
 * the methods annotated with {@code @Cacheable} each name their own cache.
 */
@Service
public class AreaService {
@Autowired
private AreaRepository areaRepository;
/**
 * Returns the areas at the given level (repository finder by level, delFlag false).
 * Results are cached in the "level:info" cache.
 */
@Cacheable(cacheNames = "level:info")
public List<AreaDTO> findProvinceList(String level) {
return areaRepository.findAllByLevelAndDelFlagIsFalse(level);
}
/** Returns the areas with the given code (repository finder by code, delFlag false). Not cached. */
public List<AreaDTO> findByCodeList(String code) {
return areaRepository.findAllByCodeAndDelFlagIsFalse(code);
}
/**
 * Returns the areas whose parent id is {@code provinceId}.
 * Results are cached in the "provinceId:info" cache.
 */
@Cacheable(cacheNames = "provinceId:info")
public List<AreaDTO> findCityList(String provinceId) {
return areaRepository.findAllByParentIdAndDelFlagIsFalse(provinceId);
}
/**
 * Returns the areas whose parent id is {@code cityId}; same repository query as
 * findCityList, cached separately in the "cityId:info" cache.
 */
@Cacheable(cacheNames = "cityId:info")
public List<AreaDTO> findOrganyList(String cityId) {
return areaRepository.findAllByParentIdAndDelFlagIsFalse(cityId);
}
/**
 * Returns the areas whose parent id is {@code organId}; same repository query as
 * findCityList, cached separately in the "organId:info" cache.
 */
@Cacheable(cacheNames = "organId:info")
public List<AreaDTO> findStreetList(String organId) {
return areaRepository.findAllByParentIdAndDelFlagIsFalse(organId);
}
4. Controller (control layer): each API endpoint calls the service directly and returns the result for display
/**
 * REST endpoints under {@code /area}. Each endpoint passes its request parameter to the
 * matching {@code AreaService} method and wraps the returned list in an {@code ObjectResponse}.
 */
@Api(tags = " Address related ")
@RestController
@RequestMapping("/area")
public class AreaController {
@Autowired
private AreaService areaService;
/**
 * GET /area/province - lists areas at the requested level.
 *
 * @param level optional query parameter "level"; defaults to "1" when omitted
 */
@ApiOperation(" Get Provincial Information ")
@GetMapping(value = "/province")
public ObjectResponse<List<AreaDTO>> getProvince(@ApiParam(value = " Administrative grade ") @RequestParam(value = "level", defaultValue = "1", required = false) String level) {
return ObjectResponse.newResponse(areaService.findProvinceList(level));
}
/**
 * GET /area/city - lists the areas under a province.
 *
 * @param provinceId query parameter "province_id"
 */
@ApiOperation(" Get city information ")
@GetMapping("/city")
public ObjectResponse<List<AreaDTO>> getCity(@ApiParam(value = " provincial level ID")@RequestParam(value = "province_id") String provinceId) {
return ObjectResponse.newResponse(areaService.findCityList(provinceId));
}
/**
 * GET /area/organ - lists the areas under a city.
 *
 * @param cityId query parameter "city_id"
 */
@ApiOperation(" Get zone information ")
@GetMapping("/organ")
public ObjectResponse<List<AreaDTO>> getOrgan(@ApiParam(value = " Municipal level ID") @RequestParam(value = "city_id") String cityId) {
return ObjectResponse.newResponse(areaService.findOrganyList(cityId));
}
/**
 * GET /area/street - lists the areas under a district.
 *
 * @param organId query parameter "organ_id"
 */
@ApiOperation(" Get street information ")
@GetMapping("/street")
public ObjectResponse<List<AreaDTO>> getStreet(@ApiParam(value = " District level ID") @RequestParam(value = "organ_id") String organId) {
return ObjectResponse.newResponse(areaService.findStreetList(organId));
}
As shown above, the province, city, district and street information can now be linked together. The core of the work is really crawling the address data: only once you have it can the program pull each unique record.
One thing needs attention: the statistics bureau's data contains a third-level address called "municipal district". This address does not really exist; it is there only for the convenience of statistics. For example, the municipal district entry shown in black has no subordinate streets, but to keep the coding consistent it still has to be handled.


Common modules -- paged query; view details; save; API
1. In a management system, the common operations are paged queries, querying the details of a record, modifying it and saving it. They are analyzed below from the top layer down.
Controller layer (request handling)

Save and modify can share a single endpoint: query with findById first; if the record does not exist, save it as a new one, and if it exists, modify it.
----> Controller layer: the save/modify endpoint
/**
 * Creates or updates a hospital from the validated request body by delegating to
 * {@code hospitalService.save}, then answers with a fixed success message.
 *
 * @param hospitalInputDTO request body describing the hospital
 * @return a message-only response
 */
@ApiOperation(" Kept in hospital / modify ")
@PostMapping()
public ObjectResponse<Void> save(
        @Valid @RequestBody HospitalInputDTO hospitalInputDTO) {
    hospitalService.save(hospitalInputDTO);
    ObjectResponse<Void> response = ObjectResponse.messageResponse(" Saved successfully ");
    return response;
}
----> Service layer: the save/modify logic the request is routed to
/**
 * Saves the hospital described by {@code inputDTO}.
 *
 * <p>When the DTO carries an id, the existing record is loaded (a {@code BaseException} is
 * thrown if it is missing) and overwritten; otherwise a new record with a generated UUID is
 * created. In both cases the registration fee is set to "0" and the delete flag to false.
 *
 * @param inputDTO values submitted by the caller
 */
public void save(HospitalInputDTO inputDTO) {
    final Hospital target;
    if (!StringUtils.isNotEmpty(inputDTO.getId())) {
        // No id supplied: create a new record and assign its id here.
        target = new Hospital();
        target.setId(RandomGenerator.buildUUID());
    } else {
        // Id supplied: the record must already exist.
        target = hospitalRepository
                .findById(inputDTO.getId())
                .orElseThrow(() -> new BaseException(" The hospital does not exist !"));
    }

    // Copy the editable fields from the DTO.
    target.setName(inputDTO.getName());
    target.setBigLogo(inputDTO.getBigLogo());
    target.setSmallLogo(inputDTO.getSmallLogo());
    target.setLevel(inputDTO.getLevel());
    target.setSampleFee(inputDTO.getSampleFee());

    // The default registration fee is 0
    target.setRegFee("0");
    target.setDelFlag(Boolean.FALSE);

    hospitalRepository.save(target);
}
When modifying, the front end by default puts the record's ID into the DTO together with the modified fields; the existing details are loaded, the new values are written back, and the record is saved. When saving a new record there is no ID yet, so a new ID has to be assigned; throughout the save process the table's ID is never auto-incremented.
Paged query / querying one record's details: a paged query simply applies the given query conditions, so the operation is straightforward, and the detail query is a direct lookup by ID.
----> Controller layer: paged query
/**
 * Lists hospitals one page at a time, optionally filtered by name.
 *
 * @param session current session, forwarded to the service
 * @param name    optional hospital-name filter
 * @param page    page number as sent by the client; defaults to 1
 * @param size    page size; defaults to 20
 * @return the page content plus the total element count, page and size
 */
@ApiOperation(" List of hospitals ")
@GetMapping("/page")
public PageResponse<HospitalOutputDTO> page(
        @ApiIgnore Session session,
        @ApiParam(value = " Hospital name ") @RequestParam(value = "name", required = false) String name,
        @RequestParam(value = "page", defaultValue = "1", required = false) int page,
        @RequestParam(value = "size", defaultValue = "20", required = false) int size) {
    Page<HospitalOutputDTO> result = hospitalService.page(session, name, page, size);
    // The response carries the total as an int, so the long count is narrowed here.
    int total = (int) result.getTotalElements();
    return PageResponse.newResponse(result.getContent(), total, page, size);
}
----> Controller layer: query the details of one record
/**
 * Returns the details of one hospital.
 *
 * @param id primary key of the hospital, taken from the "id" query parameter
 * @return the hospital details wrapped in an {@code ObjectResponse}
 */
@ApiOperation(" Hospital details ")
@GetMapping("/detail")
public ObjectResponse<HospitalDTO> detail(
        @ApiParam(" Hospital primary key id") @RequestParam(value = "id") String id) {
    return ObjectResponse.newResponse(hospitalService.detail(id));
}
----> Service layer: paged query. It uses a dynamic-condition query; the code can be copied as is
/**
 * Returns one page of hospitals, newest update first, optionally filtered by name.
 *
 * <p>{@code PageRequest.of} rejects a negative page index and a page size below one with an
 * {@code IllegalArgumentException}, so out-of-range client values are clamped to the first
 * page and to a size of one instead of failing the request.
 *
 * @param session current session (not used by the query itself)
 * @param name    optional name filter; empty or {@code null} means no name filter
 * @param page    1-based page number; values below 1 are treated as 1
 * @param size    page size; values below 1 are treated as 1
 * @return the requested page mapped to {@code HospitalOutputDTO}
 */
public Page<HospitalOutputDTO> page(Session session, String name, int page, int size) {
    HospitalDTO query = new HospitalDTO();
    query.setName(name);

    // Convert the 1-based page number to the 0-based index Spring Data expects.
    int pageIndex = page <= 1 ? 0 : page - 1;
    int pageSize = size < 1 ? 1 : size;
    Pageable pageable = PageRequest.of(pageIndex, pageSize, Sort.by(Sort.Direction.DESC, "updateTime"));

    Page<Hospital> hospitalPage = hospitalRepository.findAll(querySpecification(query), pageable);
    // Page.map keeps the paging metadata (pageable, total) while converting the content.
    return hospitalPage.map(HospitalOutputDTO::new);
}
/**
 * Builds the dynamic filter for hospital queries: an optional "name contains" condition
 * followed by a mandatory "delFlag is false" condition, combined with AND.
 *
 * @param query carrier of the filter values; only its name is read
 * @return the specification to pass to the repository
 */
private Specification<Hospital> querySpecification(HospitalDTO query) {
    return (root, criteriaQuery, criteriaBuilder) -> {
        List<Predicate> conditions = new ArrayList<>();

        String nameFilter = query.getName();
        if (StringUtils.isNotEmpty(nameFilter)) {
            String pattern = "%" + nameFilter + "%";
            conditions.add(criteriaBuilder.like(root.get("name").as(String.class), pattern));
        }

        // Deleted rows are always excluded.
        conditions.add(criteriaBuilder.equal(root.get("delFlag").as(Boolean.class), Boolean.FALSE));

        Predicate[] all = conditions.toArray(new Predicate[0]);
        return criteriaBuilder.and(all);
    };
}
----> Service layer: query the details of one record with a direct findById call
/**
 * Loads one hospital by primary key and converts it to its DTO.
 *
 * @param id primary key of the hospital
 * @return the hospital as a {@code HospitalDTO}
 * @throws BaseException when no hospital with that id exists
 */
public HospitalDTO detail(String id) {
    Hospital found = hospitalRepository
            .findById(id)
            .orElseThrow(() -> new BaseException(" The hospital does not exist !"));
    HospitalDTO dto = new HospitalDTO(found);
    return dto;
}
The resource address is shown in the following figure :https://download.csdn.net/download/zgz102928/85236645
边栏推荐
- 从小数据量 MySQL 迁移数据到 TiDB
- SVN分支管理
- Use telnet to check whether the port corresponding to the IP is open
- [teacher Zhao Yuqiang] index in mongodb (Part 2)
- [teacher Zhao Yuqiang] index in mongodb (Part 1)
- [advanced pointer (2)] | [function pointer, function pointer array, callback function] key analysis + code explanation
- @Import annotation: four ways to import configuration classes & source code analysis
- JS implements the problem of closing the current child window and refreshing the parent window
- When PHP uses env to obtain file parameters, it gets strings
- C 语言文件操作函数大全 (超详细)
猜你喜欢
![[video of Teacher Zhao Yuqiang's speech on wot] redis high performance cache and persistence](/img/a7/2140744ebad9f1dc0a609254cc618e.jpg)
[video of Teacher Zhao Yuqiang's speech on wot] redis high performance cache and persistence
![[teacher Zhao Yuqiang] MySQL high availability architecture: MHA](/img/a7/2140744ebad9f1dc0a609254cc618e.jpg)
[teacher Zhao Yuqiang] MySQL high availability architecture: MHA

Cesium 点击获三维坐标(经纬度高程)
![[teacher Zhao Yuqiang] use Oracle's tracking file](/img/0e/698478876d0dbfb37904d7b9ff9aca.jpg)
[teacher Zhao Yuqiang] use Oracle's tracking file

Life is a process of continuous learning

Code generator - single table query crud - generator

pytorch 搭建神经网络最简版

卷积神经网络CNN中的卷积操作详解
![[teacher Zhao Yuqiang] the most detailed introduction to PostgreSQL architecture in history](/img/18/f91d3d21a39743231d01f2e4015ef8.jpg)
[teacher Zhao Yuqiang] the most detailed introduction to PostgreSQL architecture in history
![[set theory] relational closure (reflexive closure | symmetric closure | transitive closure)](/img/c8/2995c503e9dabae4e2cc704449e04f.jpg)
[set theory] relational closure (reflexive closure | symmetric closure | transitive closure)
随机推荐
Detailed explanation of findloadedclass
Loss function in pytorch multi classification
[teacher Zhao Yuqiang] MySQL high availability architecture: MHA
88. 合并两个有序数组
深度学习,从一维特性输入到多维特征输入引发的思考
Kubernetes notes (V) configuration management
Understand the first prediction stage of yolov1
Merge and migrate data from small data volume, sub database and sub table Mysql to tidb
Get a screenshot of a uiscrollview, including off screen parts
Virtual memory technology sharing
GPS坐标转百度地图坐标的方法
[teacher Zhao Yuqiang] use Oracle's tracking file
Btrfs and ext4 - features, strengths and weaknesses
Simple handwritten ORM framework
1. Sum of two numbers
Cesium 点击获三维坐标(经纬度高程)
JDBC connection database steps
最大似然估计,散度,交叉熵
Using the ethtool command by example
When PHP uses env to obtain file parameters, it gets strings