当前位置:网站首页>Web page element parsing a tag
Parsing web page elements: the <a> tag
2022-07-30 04:33:00 【There is no way out of sailing against the current】
前言
This article covers parsing web page elements, focusing on how to parse the <a> tag.
Example web page elements (copied from the browser, for reference)
<div class="related-tags">
<span>Related topics are recommended:</span>
a<a href='http://www.csdn.net/tag/标签' target="_blank">标签</a>
<a href="http://www.csdn.net/tag/java" target="_blank">java</a>
<a href="http://www.csdn.net/tag/团购" target="_blank">团购</a>
<a href="http://www.csdn.net/tag/体育" target="_blank">体育</a>
<a href="http://www.csdn.net/tag/搜狐" icon='a' target="_blank">搜狐</a>
<img src="/i/eg_tulip2.jpg" alt="上海鲜花港 - 郁金香"><img src="/i/eg_tulip.jpg" alt="上海鲜花港 - 郁金香" /> </div>
如何解析
Web page elements are divided into paired (symmetric) tags and single (unpaired) tags.
A paired tag looks like <a>xxx</a>; a single tag looks like <img/>.
(1) First define the node class; its isSingleNode field records whether the tag is a single (unpaired) tag.
/**
 * A parsed web page node (one HTML tag): its tag name, its attributes, and, for paired tags,
 * the text found between the opening and closing tag.
 *
 * @author leng
 */
public class WebNode implements Serializable {

    /** Explicit version id so the serialized form does not depend on compiler-generated values. */
    private static final long serialVersionUID = 1L;

    /** Content between the opening and the closing tag; stays {@code null} unless set. */
    private String content;

    /** Attributes of the tag, sorted by attribute name. Never {@code null}. */
    private TreeMap<String, String> params = new TreeMap<>();

    /** Tag name, for example {@code a} or {@code img}. */
    private String labelType;

    /** Whether this node is a single (unpaired) tag such as {@code <img/>}. */
    private boolean isSingleNode;

    /**
     * @return the content between the opening and closing tag, or {@code null} if none was set
     */
    public String getContent() {
        return content;
    }

    /**
     * @param content the content between the opening and closing tag
     */
    public void setContent(String content) {
        this.content = content;
    }

    /**
     * @return the attributes of the tag keyed by attribute name; never {@code null}
     */
    public TreeMap<String, String> getParams() {
        return params;
    }

    /**
     * Replaces the attribute map.
     *
     * @param params the new attributes; {@code null} is stored as an empty map so that
     *               {@link #getParams()} never returns {@code null}
     */
    public void setParams(TreeMap<String, String> params) {
        this.params = (params == null) ? new TreeMap<>() : params;
    }

    /**
     * @return the tag name
     */
    public String getLabelType() {
        return labelType;
    }

    /**
     * @param labelType the tag name
     */
    public void setLabelType(String labelType) {
        this.labelType = labelType;
    }

    /**
     * @return {@code true} if this node is a single (unpaired) tag
     */
    public boolean isSingleNode() {
        return isSingleNode;
    }

    /**
     * @param isSingleNode {@code true} if this node is a single (unpaired) tag
     */
    public void setSingleNode(boolean isSingleNode) {
        this.isSingleNode = isSingleNode;
    }

    /**
     * @return a human-readable dump of all fields, intended for logging and debugging
     */
    @Override
    public String toString() {
        return "WebNode{labelType=" + labelType
                + ", singleNode=" + isSingleNode
                + ", params=" + params
                + ", content=" + content + "}";
    }
}
(2) String utility class (a supporting helper)
/**
 * String helper methods: emptiness checks, repetition, splitting into collections and maps,
 * and whitespace clean-up around a token.
 *
 * @author leng
 */
public class StrUtil {

    /**
     * @param string the string to test
     * @return {@code true} if {@code string} is {@code null} or has length zero
     */
    public static boolean isEmpty(String string) {
        return string == null || string.isEmpty();
    }

    /**
     * @param string the string to test
     * @return {@code true} if {@code string} is neither {@code null} nor of length zero
     */
    public static boolean isNotEmpty(String string) {
        return !isEmpty(string);
    }

    /**
     * Repeats {@code string} {@code n} times, joining the copies with {@code seg}.
     *
     * @param string the string to repeat; {@code null} is rendered as the text {@code "null"}
     * @param n      number of repetitions; zero or a negative number yields an empty string
     * @param seg    separator placed between copies; {@code null} is treated as {@code ""}
     * @return the joined string, without a trailing separator
     */
    public static String repeatString(String string, int n, String seg) {
        String separator = (seg == null) ? "" : seg;
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < n; i++) {
            // Writing the separator before every copy except the first avoids having to cut
            // off a trailing separator, which used to fail for n <= 0.
            if (i > 0) {
                sb.append(separator);
            }
            sb.append(string);
        }
        return sb.toString();
    }

    /**
     * Repeats {@code num} {@code n} times, joining the copies with {@code seg}.
     *
     * @param num the number to repeat
     * @param n   number of repetitions
     * @param seg separator placed between copies; {@code null} is treated as {@code ""}
     * @return the joined string
     */
    public static String repeatString(int num, int n, String seg) {
        return repeatString(String.valueOf(num), n, seg);
    }

    /**
     * Repeats {@code num} {@code n} times, joining the copies with a comma.
     *
     * @param num the number to repeat
     * @param n   number of repetitions
     * @return the joined string
     */
    public static String repeatString(int num, int n) {
        return repeatString(String.valueOf(num), n, ",");
    }

    /**
     * Joins the non-null elements of an {@code Integer} array with commas.
     *
     * @param intArray the values to join; may be {@code null}
     * @return the comma-separated values, or {@code null} if the array is {@code null} or
     *         contains no non-null element
     */
    public static String IntArrayToString(Integer[] intArray) {
        if (intArray == null) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        for (Integer value : intArray) {
            if (value == null) {
                continue;
            }
            if (sb.length() > 0) {
                sb.append(',');
            }
            sb.append(value);
        }
        return sb.length() > 0 ? sb.toString() : null;
    }

    /**
     * Tests whether {@code str} occurs in {@code array}.
     *
     * @param array      the strings to search; {@code null} elements are skipped
     * @param str        the string to look for
     * @param ignoreCase whether the comparison ignores case
     * @return {@code true} if a matching element exists; {@code false} otherwise, including when
     *         {@code array} or {@code str} is {@code null}
     */
    public static boolean isExist(String[] array, String str, boolean ignoreCase) {
        if (array == null || str == null) {
            return false;
        }
        for (String candidate : array) {
            if (candidate == null) {
                continue;
            }
            boolean matches = ignoreCase ? candidate.equalsIgnoreCase(str) : candidate.equals(str);
            if (matches) {
                return true;
            }
        }
        return false;
    }

    /**
     * Splits {@code string} by {@code splitStr} and converts the pieces to a
     * {@code List<Integer>}. Empty pieces and pieces that are not valid integers are skipped.
     *
     * @param string   the text to split
     * @param splitStr the separator; it is passed to {@link String#split(String)} and is
     *                 therefore interpreted as a regular expression
     * @return the parsed numbers in order of appearance, or {@code null} if {@code string} is
     *         {@code null} or empty
     */
    public static List<Integer> parseList(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        String[] pieces = string.split(splitStr);
        List<Integer> list = new ArrayList<>(pieces.length);
        for (String piece : pieces) {
            if (isEmpty(piece)) {
                continue;
            }
            try {
                list.add(Integer.parseInt(piece));
            } catch (NumberFormatException ignored) {
                // Best effort by design: pieces that are not integers are left out.
            }
        }
        return list;
    }

    /**
     * Splits {@code string} at commas and converts the pieces to a {@code List<Integer>}.
     *
     * @param string the text to split
     * @return the parsed numbers, or {@code null} if {@code string} is {@code null} or empty
     */
    public static List<Integer> parseList(String string) {
        return parseList(string, ",");
    }

    /**
     * Splits {@code string} by {@code splitStr} and converts the pieces to a
     * {@code Set<Integer>}. Empty pieces and pieces that are not valid integers are skipped.
     *
     * @param string   the text to split
     * @param splitStr the separator, interpreted as a regular expression
     * @return the distinct parsed numbers, or {@code null} if {@code string} is {@code null} or
     *         empty
     */
    public static Set<Integer> parseSet(String string, String splitStr) {
        List<Integer> parsed = parseList(string, splitStr);
        return parsed == null ? null : new HashSet<>(parsed);
    }

    /**
     * Splits {@code string} at commas and converts the pieces to a {@code Set<Integer>}.
     *
     * @param string the text to split
     * @return the distinct parsed numbers, or {@code null} if {@code string} is {@code null} or
     *         empty
     */
    public static Set<Integer> parseSet(String string) {
        return parseSet(string, ",");
    }

    /**
     * Splits {@code string} by {@code splitStr} and collects the pieces into a
     * {@code Set<String>}.
     *
     * @param string   the text to split
     * @param splitStr the separator, interpreted as a regular expression
     * @return the distinct pieces, or {@code null} if {@code string} is {@code null} or empty
     */
    public static Set<String> parseSetString(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        return new HashSet<>(Arrays.asList(string.split(splitStr)));
    }

    /**
     * Converts a string to a map. Example: {@code string="1-2,3-4,5-6"}, {@code seg1=","},
     * {@code seg2="-"} gives {@code {1=2, 3=4, 5=6}}. An entry without a value part is stored
     * with a {@code null} value; anything after a second {@code seg2} inside an entry is ignored.
     *
     * @param string the text to split
     * @param seg1   separator between entries, interpreted as a regular expression
     * @param seg2   separator between key and value, interpreted as a regular expression
     * @return the resulting map, or {@code null} if {@code string} is {@code null} or empty
     */
    public static Map<String, String> stringToMap(String string, String seg1, String seg2) {
        if (isEmpty(string)) {
            return null;
        }
        String[] entries = string.split(seg1);
        Map<String, String> map = new HashMap<>(entries.length);
        for (String entry : entries) {
            if (isEmpty(entry)) {
                continue;
            }
            String[] keyValue = entry.split(seg2);
            if (keyValue.length == 1) {
                map.put(keyValue[0], null);
            } else if (keyValue.length >= 2) {
                map.put(keyValue[0], keyValue[1]);
            }
        }
        return map;
    }

    /**
     * Converts a string to a map using {@code ","} between entries and {@code "-"} between key
     * and value. Example: {@code "1-2,3-4,5-6"} gives {@code {1=2, 3=4, 5=6}}.
     *
     * @param string the text to split
     * @return the resulting map, or {@code null} if {@code string} is {@code null} or empty
     */
    public static Map<String, String> stringToMap(String string) {
        return stringToMap(string, ",", "-");
    }

    /**
     * Removes a single leading comma.
     *
     * @param str the text to clean up
     * @return {@code str} without its leading comma; {@code str} itself if it does not start
     *         with a comma; {@code null} if {@code str} is {@code null}
     */
    public static String repaceFirstComma(String str) {
        if (str == null) {
            return null;
        }
        return str.startsWith(",") ? str.substring(1) : str;
    }

    /**
     * Converts a string array to an {@code int} array, skipping {@code null} and empty elements.
     *
     * @param array the strings to convert; {@code null} yields an empty array
     * @return the parsed values in their original order
     * @throws NumberFormatException if a non-empty element is not a valid integer
     */
    public static int[] strArrayToIntArray(String[] array) {
        if (array == null) {
            return new int[0];
        }
        // Fill a buffer of the maximum possible size and shrink it once at the end, instead of
        // re-allocating the result for every element.
        int[] buffer = new int[array.length];
        int count = 0;
        for (String s : array) {
            if (isNotEmpty(s)) {
                buffer[count++] = Integer.parseInt(s);
            }
        }
        return Arrays.copyOf(buffer, count);
    }

    /**
     * Removes the space characters directly before and after every occurrence of
     * {@code specialStr}. Example: {@code trim("a  =  b", "=")} gives {@code "a=b"}.
     *
     * @param string     the text to clean up
     * @param specialStr the token whose surrounding spaces are removed; it is matched literally,
     *                   not as a regular expression
     * @return the cleaned text; {@code null} if {@code string} is {@code null}; {@code string}
     *         unchanged if {@code specialStr} is {@code null}
     */
    public static String trim(String string, String specialStr) {
        if (string == null) {
            return null;
        }
        if (specialStr == null) {
            return string;
        }
        // String.replace works on literal text. The regex-based replaceAll mangled the input
        // for tokens such as "." or "+" and threw for tokens such as "(" or "$".
        String spaceBefore = " " + specialStr;
        while (string.contains(spaceBefore)) {
            // One pass removes only one space per occurrence, hence the loop.
            string = string.replace(spaceBefore, specialStr);
        }
        String spaceAfter = specialStr + " ";
        while (string.contains(spaceAfter)) {
            string = string.replace(spaceAfter, specialStr);
        }
        return string;
    }

    /**
     * Entry point kept so the class remains launchable; it intentionally does nothing.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
    }
}
(3) Core parsing logic
/**
 * Web page utilities: regex-based extraction of the page title and of simple tags.
 *
 * <p>Tags nested inside a tag of the same name are not supported, and tag content is matched
 * on a single line only because {@code .} does not match line terminators here.
 *
 * @author leng
 */
public class WebpageUtil {

    private static final String TITLE_OPEN = "<title>";
    private static final String TITLE_CLOSE = "</title>";

    /** Attribute section of a paired tag; the whole section is captured as one group. */
    private static final String PAIRED_ATTRIBUTES =
            "((?:\\s+\\w+\\s*=\\s*(?:\"|')?.*?[\"|'])*)";

    /** Attribute section of a single tag; a value may also be terminated by whitespace. */
    private static final String SINGLE_ATTRIBUTES =
            "((?:\\s+\\w+\\s*=\\s*(?:\"|')?.*?[\"|'|\\s])*)";

    /**
     * One {@code name=value} pair. Group 1 is the name; the value is in group 2 (double-quoted),
     * group 3 (single-quoted) or group 4 (unquoted).
     */
    private static final Pattern ATTRIBUTE =
            Pattern.compile("(\\w+)\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"']+))");

    /**
     * Gets the title from a web page.
     *
     * @param html the page source
     * @return the text between the first {@code <title>} and the next {@code </title>}, or
     *         {@code null} if {@code html} is {@code null} or has no such pair
     */
    public static String getTitleFromHtmlString(String html) {
        if (html == null) {
            return null;
        }
        int openIndex = html.indexOf(TITLE_OPEN);
        if (openIndex == -1) {
            return null;
        }
        int contentStart = openIndex + TITLE_OPEN.length();
        // Look for the closing tag only after the opening one; a stray earlier "</title>"
        // would otherwise produce an invalid substring range.
        int closeIndex = html.indexOf(TITLE_CLOSE, contentStart);
        if (closeIndex == -1) {
            return null;
        }
        return html.substring(contentStart, closeIndex);
    }

    /**
     * Gets the raw text of every tag named {@code nodeType} (nested tags of the same name are
     * not supported). If {@code html} contains a closing tag for that name, whole
     * {@code <x ...>...</x>} elements are returned; otherwise only the single tags
     * {@code <x ...>} or {@code <x ... />}.
     *
     * @param html     the page source
     * @param nodeType the tag name, for example {@code a} or {@code img}; matched literally and
     *                 case-insensitively
     * @return the matched fragments in document order; empty if nothing matches or an argument
     *         is {@code null}
     */
    public static List<String> getNode(String html, String nodeType) {
        List<String> list = new ArrayList<>();
        if (html == null || nodeType == null) {
            return list;
        }
        Matcher m = buildTagPattern(nodeType, hasClosingTag(html, nodeType)).matcher(html);
        while (m.find()) {
            list.add(m.group(0));
        }
        return list;
    }

    /**
     * Gets every tag named {@code labelType} as a parsed {@link WebNode}. Tags are located
     * exactly as in {@link #getNode(String, String)}. Each node carries the tag name, the
     * single/paired flag and the attributes; paired tags also carry their content.
     *
     * @param html      the page source
     * @param labelType the tag name, for example {@code a} or {@code img}; matched literally and
     *                  case-insensitively
     * @return the parsed nodes in document order; empty if nothing matches or an argument is
     *         {@code null}
     */
    public static List<WebNode> getNode2(String html, String labelType) {
        List<WebNode> list = new ArrayList<>();
        if (html == null || labelType == null) {
            return list;
        }
        boolean paired = hasClosingTag(html, labelType);
        Matcher m = buildTagPattern(labelType, paired).matcher(html);
        while (m.find()) {
            WebNode node = new WebNode();
            node.setLabelType(labelType);
            node.setSingleNode(!paired);
            if (paired) {
                // Group 2 is exactly the text between the opening and the closing tag, so
                // content containing '>' or '<' is kept intact.
                node.setContent(m.group(2));
            }
            node.setParams(parseAttributes(m.group(1)));
            list.add(node);
        }
        return list;
    }

    /** Tells whether {@code html} contains a closing tag for {@code tagName}, ignoring case. */
    private static boolean hasClosingTag(String html, String tagName) {
        String closingTag = Pattern.quote("</" + tagName + ">");
        return Pattern.compile(closingTag, Pattern.CASE_INSENSITIVE).matcher(html).find();
    }

    /**
     * Builds the tag-matching pattern. Group 1 is the attribute section; for paired tags
     * group 2 is the content.
     */
    private static Pattern buildTagPattern(String tagName, boolean paired) {
        // Quote the name so characters such as '$' or '.' are matched literally.
        String name = Pattern.quote(tagName);
        String regex = paired
                ? "<" + name + PAIRED_ATTRIBUTES + ">(.*?)</" + name + ">"
                : "<" + name + SINGLE_ATTRIBUTES + "\\s*/?>";
        return Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
    }

    /** Parses the attribute section of a tag into a map sorted by attribute name. */
    private static TreeMap<String, String> parseAttributes(String attributeText) {
        TreeMap<String, String> params = new TreeMap<>();
        Matcher m = ATTRIBUTE.matcher(attributeText);
        while (m.find()) {
            String value = m.group(2);
            if (value == null) {
                value = m.group(3);
            }
            if (value == null) {
                value = m.group(4);
            }
            // Quoted values are stored without their quotes and may contain spaces.
            params.put(m.group(1), value);
        }
        return params;
    }
}
(4)测试
/**
 * Demo: extracts the {@code a} tags from a sample HTML fragment and prints first their raw
 * text and then, for each parsed node, its content, tag type and attributes.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    String html = "<div class=\"related-tags\">\n" + " <span>Related topics are recommended:</span>\n"
            + " a<a href=\'http://www.csdn.net/tag/标签\' target=\"_blank\">标签</a>\n"
            + " <a href=\"http://www.csdn.net/tag/java\" target=\"_blank\">java</a>\n"
            + " <a href=\"http://www.csdn.net/tag/团购\" target=\"_blank\">团购</a>\n"
            + " <a href=\"http://www.csdn.net/tag/体育\" target=\"_blank\">体育</a>\n"
            + " <a href=\"http://www.csdn.net/tag/搜狐\" icon='a' target=\"_blank\">搜狐</a>\n"
            + "<img src=\"/i/eg_tulip2.jpg\" alt=\"上海鲜花港 - 郁金香\"><img src=\"/i/eg_tulip.jpg\" alt=\"上海鲜花港 - 郁金香\" /> </div>";

    // Raw text of every matched tag.
    List<String> rawTags = WebpageUtil.getNode(html, "a");
    System.out.println("-------aThe original content of the label------");
    for (String rawTag : rawTags) {
        System.out.println(rawTag);
    }

    // The same tags, parsed into nodes.
    List<WebNode> nodes = WebpageUtil.getNode2(html, "a");
    System.out.println("-------解析后的数据------");
    for (WebNode node : nodes) {
        System.out.println("内容:" + node.getContent());
        System.out.println("标签类型:" + node.getLabelType());
        System.out.println("标签的参数:" + node.getParams());
    }
}
After running it, the output shows that the information of the <a> tags inside the page's div has been parsed out.
边栏推荐
- Data Lake: Data Integration Tool DataX
- 《构建之法》笔记---第十章 典型用户和场景
- DAY17: weak password detection and test
- - B + tree index and MySQL series 】 【 what is the difference between a HASH index
- Thymeleaf简介
- sqlmap use tutorial Daquan command Daquan (graphics)
- 海外多家权威媒体热议波场TRON:为互联网去中心化奠定基础
- Taobao H5 interface to obtain app data 6.0 format
- Code open source design and implementation ideas
- The Azure developer news 丨 memorabilia in July
猜你喜欢

Chapter8 Support Vector Machines

What is CDH/CDP?

Based on all volunteers - H and D1 XR806 rare plant monitoring device

【C语言】程序环境和预处理

MySQL 操作语句大全(详细)

【 notes 】 the beauty of the software engineering - column 31 | software testing are responsible for the quality of products?

在麒麟V10操作系统上安装MySQL数据库

Shanxi group (enterprises) in the second network security skills competition part problem WP (7)

swagger使用教程——快速使用swagger

GCC Rust获批将被纳入主线代码库,或将于GCC 13中与大家见面
随机推荐
图像视角矫正之透视变换矩阵(单应矩阵)/findHomography 与 getPerspectiveTransformd的区别
Data Lake: Data Integration Tool DataX
网页元素解析a标签
MySQL 安装报错的解决方法
2.4 hill sorting
DAY17:弱口令的探测与测试
复现XXL-JOB 任务调度中心后台任意命令执行漏洞
Boss Rush (二分答案 + 状压DP)
MySQL 操作语句大全(详细)
【线性表】- LeetCode力扣三道练习题详解
海外多家权威媒体热议波场TRON:为互联网去中心化奠定基础
精品MySQL面试题,备战八月99%必问!过不了面试算我的
js operation to add or subtract from the current date (day, week, month, year)
state space representation
山西省第二届网络安全技能大赛(企业组)部分赛题WP(十)
2021山东省网络搭建与应用赛项试题
2.5快速排序
2.6 Radix sort (bucket sort)
如何与墨西哥大众VW Mexico建立EDI连接
Introduction to Thymeleaf