当前位置:网站首页>Project summary -- 2 (basic use of jsoup)
Project summary -- 2 (basic use of jsoup)
2022-07-03 06:06:00 【Programmer DD】
Crawling province-, city- and county-level data from a web page
1. To build a cascading province / city / county selector, all of the administrative-division data has to be crawled first so that the front end can fetch it via AJAX.
Because of a business requirement, users must be able to choose their address. I therefore looked online for the latest province, city and town data; the data source is the following page:
"2021 division codes and urban-rural division codes for statistical use"
1. Introducing org.jsoup
jsoup is an HTML parser for Java.
2. Maven dependency
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.9.2</version>
</dependency>
3. Write an entity class to hold the crawled data
RegionEntry.java
package cn.jiangdoc.utils;
import java.util.ArrayList;
import java.util.List;
/**
 * One node of the crawled region tree: an administrative code, a display name
 * and the list of directly subordinate regions.
 *
 * <p>{@link #getSub()} never returns {@code null}: a {@code null} list handed to the
 * constructor or to {@link #setSub(List)} is replaced by a new empty list, so callers
 * can always iterate the children without a null check.
 */
public class RegionEntry {
    private String code;
    private String name;
    private List<RegionEntry> sub = new ArrayList<>();

    /** Creates an entry without code or name and with an empty child list. */
    public RegionEntry() {
    }

    /**
     * Creates a fully populated entry.
     *
     * @param code region code
     * @param name region name
     * @param sub  child regions; {@code null} is treated as "no children"
     */
    public RegionEntry(String code, String name, List<RegionEntry> sub) {
        this.code = code;
        this.name = name;
        this.sub = orEmpty(sub);
    }

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /**
     * @return the live, mutable child list (never {@code null})
     */
    public List<RegionEntry> getSub() {
        return sub;
    }

    /**
     * Replaces the child list.
     *
     * @param sub new child list, stored as-is; {@code null} resets to an empty list
     */
    public void setSub(List<RegionEntry> sub) {
        this.sub = orEmpty(sub);
    }

    /** Returns the given list unchanged, or a new empty list when it is {@code null}. */
    private static List<RegionEntry> orEmpty(List<RegionEntry> list) {
        return list != null ? list : new ArrayList<RegionEntry>();
    }
}
4. Now write the crawler itself
AddressData.java
package cn.jiangdoc.utils;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawls the administrative-division pages below {@link #SITE_URL} with jsoup, walking
 * province -> city -> county -> town, and dumps the result as SQL {@code insert}
 * statements into a text file.
 *
 * <p>The crawl is best effort: a page that cannot be fetched is reported on stderr and
 * its subtree is simply missing from the output.
 *
 * @author jiangdoc
 * @date 2019-3-16
 */
public class AddressData {
    public static String SITE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";

    /** Target file of the generated SQL script. */
    private static final String OUTPUT_FILE = "G:\\log\\city.txt";
    /** Province codes are right-padded with zeros to this many characters. */
    private static final int PROVINCE_CODE_LENGTH = 6;
    /** Timeout applied to every page request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 30000;
    /** Pause after each province, in milliseconds, to go easy on the server. */
    private static final long PROVINCE_PAUSE_MILLIS = 1000L;
    /**
     * Name the source site gives to the city-level placeholder row ("municipal district").
     * The site is in Chinese and {@code Element.text()} is trimmed, so the comparison has
     * to use the exact Chinese text without surrounding blanks.
     */
    private static final String MUNICIPAL_DISTRICT = "市辖区";

    private static List<RegionEntry> regions = new ArrayList<RegionEntry>();

    /**
     * Runs the crawl and writes the SQL script to {@code OUTPUT_FILE}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println(" Grab start :" + new Date());
        getProvince();
        String script = buildInsertScript(regions);
        // UTF-8 is set explicitly: the names are Chinese and the platform default charset
        // differs between machines. try-with-resources closes the stream on every path.
        try (FileOutputStream out = new FileOutputStream(new File(OUTPUT_FILE))) {
            out.write(script.getBytes(StandardCharsets.UTF_8));
            out.flush();
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println(" After grabbing :" + new Date());
    }

    /**
     * Renders the crawled tree as one {@code insert} statement per region, each row carrying
     * the codes of all its ancestors followed by its own code, name and level number.
     */
    private static String buildInsertScript(List<RegionEntry> provinces) {
        StringBuilder content = new StringBuilder();
        for (RegionEntry one : provinces) {
            String provinceCode = sqlText(one.getCode());
            content.append("insert into sys_province values(null,'").append(provinceCode)
                    .append("', '").append(sqlText(one.getName())).append("', 1 );\r\n");
            for (RegionEntry two : one.getSub()) {
                String cityCode = sqlText(two.getCode());
                content.append("insert into sys_city values(null,'").append(provinceCode)
                        .append("', '").append(cityCode)
                        .append("','").append(sqlText(two.getName())).append("', 2);\r\n");
                for (RegionEntry three : two.getSub()) {
                    String countyCode = sqlText(three.getCode());
                    content.append("insert into sys_county values(null,'").append(provinceCode)
                            .append("', '").append(cityCode)
                            .append("', '").append(countyCode)
                            .append("', '").append(sqlText(three.getName())).append("', 3 );\r\n");
                    for (RegionEntry four : three.getSub()) {
                        content.append("insert into sys_town values(null,'").append(provinceCode)
                                .append("', '").append(cityCode)
                                .append("', '").append(countyCode)
                                .append("', '").append(sqlText(four.getCode()))
                                .append("','").append(sqlText(four.getName())).append("', 4 );\r\n");
                    }
                }
            }
        }
        return content.toString();
    }

    /**
     * Makes scraped text safe inside a single-quoted SQL literal by doubling every quote.
     * A {@code null} value is rendered as the text {@code null}, as plain concatenation would.
     */
    private static String sqlText(String value) {
        return String.valueOf(value).replace("'", "''");
    }

    /** Downloads and parses one page, applying the shared timeout. */
    private static Document fetch(String url) throws IOException {
        return Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();
    }

    /** Right-pads a province prefix with zeros up to {@code PROVINCE_CODE_LENGTH}. */
    private static String padCode(String prefix) {
        StringBuilder code = new StringBuilder(prefix);
        while (code.length() < PROVINCE_CODE_LENGTH) {
            code.append('0');
        }
        return code.toString();
    }

    /**
     * Copies code (first cell) and name (second cell) of a table row into {@code entry}.
     * Rows whose cells are links are read from the {@code a} elements; rows without links
     * fall back to the plain {@code td} cells.
     *
     * @return {@code false} when the row has fewer than two usable cells and was not read
     */
    private static boolean readCodeAndName(Element row, RegionEntry entry) {
        Elements cells = row.select("a");
        if (cells.size() < 2) {
            cells = row.select("td");
        }
        if (cells.size() < 2) {
            return false;
        }
        entry.setCode(cells.get(0).text());
        entry.setName(cells.get(1).text());
        return true;
    }

    /** Absolute URL of the first link in the row, or an empty string when there is none. */
    private static String childUrl(Element row) {
        Element link = row.select("a").first();
        return link == null ? "" : link.attr("abs:href");
    }

    /** True when there is no child page to follow. */
    private static boolean isMissing(String url) {
        return url == null || url.isEmpty();
    }

    /** Reports a page that could not be fetched; the crawl continues without it. */
    private static void reportFailure(String url, IOException e) {
        System.err.println("Failed to fetch " + url);
        e.printStackTrace();
    }

    /**
     * Reads the province index page, crawls every province and appends it to
     * {@code regions}. Stops early when the thread is interrupted.
     */
    private static void getProvince() {
        try {
            Document doc = fetch(SITE_URL);
            Elements links = doc.select("tr.provincetr").select("a");
            for (Element e : links) {
                RegionEntry region = new RegionEntry();
                // The part of the href before the first dot is the province prefix.
                String href = e.attr("href");
                int dot = href.indexOf('.');
                String prefix = dot < 0 ? href : href.substring(0, dot);
                region.setCode(padCode(prefix));
                region.setName(e.text());
                // "abs:href" resolves the link to an absolute URL.
                String absHref = e.attr("abs:href");
                System.out.println(absHref);
                getCity(absHref, region);
                regions.add(region);
                try {
                    Thread.sleep(PROVINCE_PAUSE_MILLIS);
                } catch (InterruptedException interrupted) {
                    // Keep the interrupt visible to the caller and stop crawling.
                    Thread.currentThread().interrupt();
                    return;
                }
            }
        } catch (IOException e) {
            reportFailure(SITE_URL, e);
        }
    }

    /**
     * Get the city address
     *
     * @param url    absolute URL of the province page
     * @param region province entry that receives the cities
     */
    private static void getCity(String url, RegionEntry region) {
        if (isMissing(url)) {
            return;
        }
        try {
            Document doc = fetch(url);
            // Expected row shape:
            // <tr class='citytr'><td><a href='65/6501.html'>650100000000</a></td><td><a href='65/6501.html'>name</a></td></tr>
            for (Element row : doc.select("tr.citytr")) {
                RegionEntry city = new RegionEntry();
                if (!readCodeAndName(row, city)) {
                    continue;
                }
                if (MUNICIPAL_DISTRICT.equals(city.getName())) {
                    // Placeholder row: show the province name instead.
                    city.setName(region.getName());
                }
                getArea(childUrl(row), city);
                region.getSub().add(city);
            }
        } catch (IOException e) {
            reportFailure(url, e);
        }
    }

    /**
     * Get the District address
     *
     * @param url    absolute URL of the city page
     * @param region city entry that receives the counties
     */
    private static void getArea(String url, RegionEntry region) {
        if (isMissing(url)) {
            return;
        }
        try {
            Document doc = fetch(url);
            // Expected row shape:
            // <tr class='countytr'><td><a href='01/130102.html'>130102000000</a></td><td><a href='01/130102.html'>name</a></td></tr>
            for (Element row : doc.select("tr.countytr")) {
                RegionEntry area = new RegionEntry();
                if (!readCodeAndName(row, area)) {
                    continue;
                }
                // Rows without links have no town page; getTown ignores the empty URL.
                getTown(childUrl(row), area);
                region.getSub().add(area);
            }
        } catch (IOException e) {
            reportFailure(url, e);
        }
    }

    /**
     * Get the township addresses
     *
     * @param url    absolute URL of the county page
     * @param region county entry that receives the towns
     */
    private static void getTown(String url, RegionEntry region) {
        if (isMissing(url)) {
            return;
        }
        try {
            Document doc = fetch(url);
            // Expected row shape:
            // <tr class='towntr'><td><a href='07/110107001.html'>110107001000</a></td><td><a href='07/110107001.html'>name</a></td></tr>
            for (Element row : doc.select("tr.towntr")) {
                RegionEntry town = new RegionEntry();
                if (readCodeAndName(row, town)) {
                    region.getSub().add(town);
                }
            }
        } catch (IOException e) {
            reportFailure(url, e);
        }
    }
}
The main pitfall is that requests may time out while crawling; it is advisable to make the Thread.sleep interval a little longer,
or to add the request headers shown below. The data may need to be verified afterwards; the generated SQL should be roughly 5-6 MB, at which size the basic data is essentially complete.
// Build the request with explicit headers before fetching the page.
Connection connect = Jsoup.connect("url");
// Map takes two type arguments (header name -> header value).
Map<String, String> header = new HashMap<>();
header.put("Content-type", "application/json");
Connection data = connect.headers(header);
Document document = data.get();
Back-end interfaces for the province / city selector
How the crawler is set up depends on the structure of the data source. On the application side, matching interfaces have to be provided; the key point is how IDs are passed, and that is handled by the front end: the front-end JS sends the ID of the selected parent, and the back end simply queries by the ID it receives. The overall business logic is very simple.

1. Entity class design
/**
 * JPA entity mapped to the {@code tb_area} table: one row per administrative area.
 * The column comments below describe each field (primary key, parent id, code, name,
 * grade, creation/update time, deletion flag).
 */
@Data
@Entity
@Table(name = "tb_area")
@NoArgsConstructor
@EntityListeners(AuditingEntityListener.class)
public class Area {
// Primary key.
@Id
@Column(columnDefinition = "varchar(32) comment ' Primary key id' ")
private String id;
// Id of the parent area; the repository looks children up by this value.
@Column(columnDefinition = "varchar(32) comment ' Parent id' ")
private String parentId;
// Area code.
@Column(columnDefinition = "varchar(32) comment ' code ' ")
private String code;
// Area name.
@Column(columnDefinition = "varchar(32) comment ' name ' ")
private String name;
// Administrative grade, stored as text.
// NOTE(review): the province endpoint defaults to "1", presumably 1 = province; confirm.
@Column(columnDefinition = "varchar(32) comment ' Grade ' ")
private String level;
// Creation timestamp, annotated for the auditing listener declared on the class.
@Column(columnDefinition = "datetime comment ' Creation time ' ")
@CreatedDate
private LocalDateTime createTime;
// Last-update timestamp, annotated for the auditing listener declared on the class.
@Column(columnDefinition = "datetime comment ' Update time ' ")
@LastModifiedDate
private LocalDateTime updateTime;
// Soft-delete flag column; the repository queries only return rows where it is false.
@Column(columnDefinition = "tinyint(1) comment ' Whether or not to delete ' ")
private Boolean delFlag = false;
2. DAO (AreaRepository): database persistence layer
/**
 * Spring Data repository for {@code Area} rows. Every finder is a derived query
 * (built from the method name) that skips rows flagged as deleted and returns
 * the matches as {@code AreaDTO} objects.
 */
public interface AreaRepository
        extends JpaRepository<Area, String>, JpaSpecificationExecutor<Area> {

    /** Non-deleted areas whose parent id equals {@code parentId}. */
    List<AreaDTO> findAllByParentIdAndDelFlagIsFalse(String parentId);

    /** Non-deleted areas of the given administrative level. */
    List<AreaDTO> findAllByLevelAndDelFlagIsFalse(String level);

    /** Non-deleted areas carrying the given code. */
    List<AreaDTO> findAllByCodeAndDelFlagIsFalse(String code);
}
3. Service (business layer): every method simply delegates to the corresponding DAO query; there is no complex logic.
/**
 * Read-only lookups for the cascading address selector. Each method delegates
 * straight to one {@code AreaRepository} query; most results are annotated for caching.
 */
@Service
public class AreaService {
@Autowired
private AreaRepository areaRepository;
/**
 * Lists the non-deleted areas of the given level (annotated with cache "level:info").
 */
@Cacheable(cacheNames = "level:info")
public List<AreaDTO> findProvinceList(String level) {
return areaRepository.findAllByLevelAndDelFlagIsFalse(level);
}
/**
 * Lists the non-deleted areas with the given code. This lookup is not cached.
 */
public List<AreaDTO> findByCodeList(String code) {
return areaRepository.findAllByCodeAndDelFlagIsFalse(code);
}
/**
 * Lists the children of a province (annotated with cache "provinceId:info").
 */
@Cacheable(cacheNames = "provinceId:info")
public List<AreaDTO> findCityList(String provinceId) {
return areaRepository.findAllByParentIdAndDelFlagIsFalse(provinceId);
}
/**
 * Lists the children of a city (annotated with cache "cityId:info").
 * Runs the same parent-id query as findCityList; only the cache name differs.
 */
@Cacheable(cacheNames = "cityId:info")
public List<AreaDTO> findOrganyList(String cityId) {
return areaRepository.findAllByParentIdAndDelFlagIsFalse(cityId);
}
/**
 * Lists the children of a district (annotated with cache "organId:info").
 * Runs the same parent-id query as findCityList; only the cache name differs.
 */
@Cacheable(cacheNames = "organId:info")
public List<AreaDTO> findStreetList(String organId) {
return areaRepository.findAllByParentIdAndDelFlagIsFalse(organId);
}
4. Controller (control layer): exposes each query through its own API endpoint
/**
 * REST endpoints under {@code /area} for the cascading address selector.
 * Each endpoint forwards its request parameter to one {@code AreaService} lookup
 * and wraps the resulting list in an {@code ObjectResponse}.
 */
@Api(tags = " Address related ")
@RestController
@RequestMapping("/area")
public class AreaController {
@Autowired
private AreaService areaService;
/**
 * GET /area/province: areas of the requested level.
 * The optional "level" parameter defaults to "1".
 */
@ApiOperation(" Get Provincial Information ")
@GetMapping(value = "/province")
public ObjectResponse<List<AreaDTO>> getProvince(@ApiParam(value = " Administrative grade ") @RequestParam(value = "level", defaultValue = "1", required = false) String level) {
return ObjectResponse.newResponse(areaService.findProvinceList(level));
}
/**
 * GET /area/city: areas whose parent is the province given in "province_id".
 */
@ApiOperation(" Get city information ")
@GetMapping("/city")
public ObjectResponse<List<AreaDTO>> getCity(@ApiParam(value = " provincial level ID")@RequestParam(value = "province_id") String provinceId) {
return ObjectResponse.newResponse(areaService.findCityList(provinceId));
}
/**
 * GET /area/organ: areas whose parent is the city given in "city_id".
 */
@ApiOperation(" Get zone information ")
@GetMapping("/organ")
public ObjectResponse<List<AreaDTO>> getOrgan(@ApiParam(value = " Municipal level ID") @RequestParam(value = "city_id") String cityId) {
return ObjectResponse.newResponse(areaService.findOrganyList(cityId));
}
/**
 * GET /area/street: areas whose parent is the district given in "organ_id".
 */
@ApiOperation(" Get street information ")
@GetMapping("/street")
public ObjectResponse<List<AreaDTO>> getStreet(@ApiParam(value = " District level ID") @RequestParam(value = "organ_id") String organId) {
return ObjectResponse.newResponse(areaService.findStreetList(organId));
}
As shown above, province, city, district and street information can now be linked together. The core of the whole exercise is obtaining the address data: only once the data has been crawled can the program look up a unique record.
One thing needs attention: the Bureau of Statistics data contains a third-level "municipal district" entry under some cities. This address does not really exist; it is only there for the convenience of statistics. Such an entry has no subordinate streets beneath it, but for the sake of consistent coding it still has to be handled.


Common modules -- paged query; detail view; save; interfaces
1. A management system typically needs the same few operations: paged queries, viewing the details of a record, and modifying and saving it. They are walked through from top to bottom below.
Controller layer (request mapping)

Save and update can share one interface: look the record up with findById first; if no record exists yet, create it, and if it does exist, update it.
----> Controller layer: the save / update endpoint
/**
 * Creates a hospital, or updates an existing one, from the validated request body.
 *
 * @param hospitalInputDTO hospital data sent by the client
 * @return a response that only carries a confirmation message
 */
@ApiOperation(" Kept in hospital / modify ")
@PostMapping
public ObjectResponse<Void> save(@Valid @RequestBody HospitalInputDTO hospitalInputDTO) {
    hospitalService.save(hospitalInputDTO);
    final String confirmation = " Saved successfully ";
    return ObjectResponse.messageResponse(confirmation);
}
----> Service layer: the method the save / update request ends up in
/**
 * Saves a hospital: updates the stored record when the DTO carries an id,
 * otherwise creates a new record with a generated id.
 *
 * @param inputDTO incoming hospital data
 * @throws BaseException when an id is given but no such hospital exists
 */
public void save(HospitalInputDTO inputDTO) {
    final Hospital target = loadOrCreateHospital(inputDTO);

    target.setName(inputDTO.getName());
    target.setBigLogo(inputDTO.getBigLogo());
    target.setSmallLogo(inputDTO.getSmallLogo());
    target.setLevel(inputDTO.getLevel());
    target.setSampleFee(inputDTO.getSampleFee());
    // The registration fee always starts out as 0.
    target.setRegFee("0");
    target.setDelFlag(Boolean.FALSE);

    hospitalRepository.save(target);
}

/** Returns the stored hospital for the DTO's id, or a fresh one with a new UUID when no id is given. */
private Hospital loadOrCreateHospital(HospitalInputDTO inputDTO) {
    if (!StringUtils.isNotEmpty(inputDTO.getId())) {
        Hospital created = new Hospital();
        created.setId(RandomGenerator.buildUUID());
        return created;
    }
    return hospitalRepository.findById(inputDTO.getId())
            .orElseThrow(() -> new BaseException(" The hospital does not exist !"));
}
When updating, the front end puts the record's ID into the DTO together with the modified fields; the back end loads the existing record, writes the fields back and saves it. When no ID is supplied a new record is created, so a new ID has to be assigned in code -- which means the ID column in the table must not be auto-increment.
Paged query / detail query: a paged query combines the supplied search criteria and runs the query, so it is quite simple overall; the detail query just looks a record up by its ID.
----> Controller layer: paged query
/**
 * Returns one page of hospitals, optionally filtered by name.
 *
 * @param session current session (hidden from the API docs)
 * @param name    optional hospital-name filter
 * @param page    1-based page number, defaults to 1
 * @param size    page size, defaults to 20
 * @return the rows of the requested page together with the total row count
 */
@ApiOperation(" List of hospitals ")
@GetMapping("/page")
public PageResponse<HospitalOutputDTO> page(@ApiIgnore Session session,
        @ApiParam(value = " Hospital name ") @RequestParam(value = "name", required = false) String name,
        @RequestParam(value = "page", defaultValue = "1", required = false) int page,
        @RequestParam(value = "size", defaultValue = "20", required = false) int size) {
    Page<HospitalOutputDTO> result = hospitalService.page(session, name, page, size);
    int total = (int) result.getTotalElements();
    return PageResponse.newResponse(result.getContent(), total, page, size);
}
----> Controller layer: detail query for a single record
/**
 * Returns the details of one hospital.
 *
 * @param id primary key of the hospital
 * @return the hospital wrapped in an {@code ObjectResponse}
 */
@ApiOperation(" Hospital details ")
@GetMapping("/detail")
public ObjectResponse<HospitalDTO> detail(@ApiParam(" Hospital primary key id") @RequestParam(value = "id") String id) {
    return ObjectResponse.newResponse(hospitalService.detail(id));
}
----> Service layer: paged query. It uses a dynamic-criteria (Specification) query; the pattern can be copied as-is.
/**
 * Returns one page of non-deleted hospitals, newest update first, optionally
 * filtered by a name fragment.
 *
 * @param session current session (not used by the query)
 * @param name    optional name fragment; empty or {@code null} means no name filter
 * @param page    1-based page number; values below 1 are treated as 1
 * @param size    page size; values below 1 are treated as 1
 * @return the requested page mapped to output DTOs
 */
public Page<HospitalOutputDTO> page(Session session, String name, int page, int size) {
    HospitalDTO query = new HospitalDTO();
    query.setName(name);

    // PageRequest.of rejects a negative index or a size below 1 with an
    // IllegalArgumentException, so out-of-range input is clamped instead of failing.
    int pageIndex = Math.max(page, 1) - 1;
    int pageSize = Math.max(size, 1);
    Pageable pageable = PageRequest.of(pageIndex, pageSize, Sort.by(Sort.Direction.DESC, "updateTime"));

    // Page.map converts the content and keeps the paging metadata and total count.
    return hospitalRepository.findAll(querySpecification(query), pageable)
            .map(HospitalOutputDTO::new);
}
/**
 * Builds the dynamic filter for the hospital list: an optional "name contains"
 * condition combined with the mandatory "not deleted" condition.
 *
 * @param query filter values; only {@code name} is read
 * @return specification AND-ing all active conditions
 */
private Specification<Hospital> querySpecification(HospitalDTO query) {
    return (root, criteriaQuery, criteriaBuilder) -> {
        List<Predicate> predicateList = new ArrayList<>();
        String name = query.getName();
        if (StringUtils.isNotEmpty(name)) {
            // Escape the LIKE wildcards so that "%" and "_" typed by the user are matched
            // literally. The escape character itself is escaped first.
            String literal = name.replace("!", "!!").replace("%", "!%").replace("_", "!_");
            predicateList.add(criteriaBuilder.like(root.get("name").as(String.class), "%" + literal + "%", '!'));
        }
        predicateList.add(criteriaBuilder.equal(root.get("delFlag").as(Boolean.class), Boolean.FALSE));
        return criteriaBuilder.and(predicateList.toArray(new Predicate[0]));
    };
}
----> Service layer: detail query for a single record; a plain findById is all that is needed
/**
 * Loads one hospital by primary key and converts it to its DTO.
 *
 * @param id primary key of the hospital
 * @return DTO built from the stored hospital
 * @throws BaseException when no hospital with that id exists
 */
public HospitalDTO detail(String id) {
    return hospitalRepository.findById(id)
            .map(HospitalDTO::new)
            .orElseThrow(() -> new BaseException(" The hospital does not exist !"));
}
The resources can be downloaded from: https://download.csdn.net/download/zgz102928/85236645
边栏推荐
- [teacher Zhao Yuqiang] redis's slow query log
- The programmer shell with a monthly salary of more than 10000 becomes a grammar skill for secondary school. Do you often use it!!!
- Detailed explanation of contextclassloader
- QT read write excel -- qxlsx insert chart 5
- Why should there be a firewall? This time xiaowai has something to say!!!
- @Import annotation: four ways to import configuration classes & source code analysis
- Apache+php+mysql environment construction is super detailed!!!
- Simple solution of small up main lottery in station B
- [advanced pointer (2)] | [function pointer, function pointer array, callback function] key analysis + code explanation
- Sorry, this user does not exist!
猜你喜欢

Kubernetes notes (VIII) kubernetes security

Disruptor learning notes: basic use, core concepts and principles

.NET程序配置文件操作(ini,cfg,config)

Strategy pattern: encapsulate changes and respond flexibly to changes in requirements

项目总结--04

Jedis source code analysis (I): jedis introduction, jedis module source code analysis

Kubesphere - Multi tenant management
![[teacher Zhao Yuqiang] Flink's dataset operator](/img/cc/5509b62756dddc6e5d4facbc6a7c5f.jpg)
[teacher Zhao Yuqiang] Flink's dataset operator

GPS坐标转百度地图坐标的方法
![[escape character] [full of dry goods] super detailed explanation + code illustration!](/img/33/ec5a5e11bfd43f53f2767a9a0f0cc9.jpg)
[escape character] [full of dry goods] super detailed explanation + code illustration!
随机推荐
项目总结--2(Jsoup的基本使用)
BeanDefinitionRegistryPostProcessor
Tabbar settings
Method of converting GPS coordinates to Baidu map coordinates
.NET程序配置文件操作(ini,cfg,config)
Code generator - single table query crud - generator
Mysql database table export and import with binary
Exportation et importation de tables de bibliothèque avec binaires MySQL
Kubernetes notes (VII) kuberetes scheduling
arcgis创建postgre企业级数据库
Kubernetes notes (V) configuration management
The most responsible command line beautification tutorial
Kubernetes notes (I) kubernetes cluster architecture
PHP用ENV获取文件参数的时候拿到的是字符串
从小数据量分库分表 MySQL 合并迁移数据到 TiDB
Detailed explanation of findloadedclass
项目总结--01(接口的增删改查;多线程的使用)
Why is the website slow to open?
CAD插件的安装和自动加载dll、arx
ThreadLocal的简单理解