当前位置:网站首页>Web page element parsing a tag
Web page element parsing a tag
2022-07-30 04:33:00 【There is no way out of sailing against the current】
前言
This chapter covers parsing web page elements, focusing on how to parse the <a> tag.
Example web page elements (copied from a browser, for reference)
<div class="related-tags">
<span>Related topics are recommended:</span>
a<a href='http://www.csdn.net/tag/标签' target="_blank">标签</a>
<a href="http://www.csdn.net/tag/java" target="_blank">java</a>
<a href="http://www.csdn.net/tag/团购" target="_blank">团购</a>
<a href="http://www.csdn.net/tag/体育" target="_blank">体育</a>
<a href="http://www.csdn.net/tag/搜狐" icon='a' target="_blank">搜狐</a>
<img src="/i/eg_tulip2.jpg" alt="上海鲜花港 - 郁金香"><img src="/i/eg_tulip.jpg" alt="上海鲜花港 - 郁金香" /> </div>
如何解析
Web page elements are divided into paired tags and single (unpaired) tags.
Paired tags look like <a>xxx</a>; single tags look like <img/>.
(1) The node class therefore defines an isSingleNode flag that records whether a tag is a single-node tag.
/**
 * A single element extracted from a web page: its tag name, its attributes, and (for paired tags)
 * the text found between the opening and closing tag.
 *
 * @author leng
 */
public class WebNode implements Serializable {

    /** Tag name of this node. */
    private String labelType;

    /** Whether this node is a single (unpaired) tag. */
    private boolean isSingleNode;

    /** Attributes of the tag, keyed by attribute name; starts out as an empty map. */
    private TreeMap<String, String> params = new TreeMap<>();

    /** Text between the opening and closing tag. */
    private String content;

    /** @return the tag name of this node */
    public String getLabelType() {
        return this.labelType;
    }

    /** @param labelType the tag name to store */
    public void setLabelType(String labelType) {
        this.labelType = labelType;
    }

    /** @return {@code true} if this node is a single (unpaired) tag */
    public boolean isSingleNode() {
        return this.isSingleNode;
    }

    /** @param isSingleNode whether this node is a single (unpaired) tag */
    public void setSingleNode(boolean isSingleNode) {
        this.isSingleNode = isSingleNode;
    }

    /** @return the attribute map currently held by this node (the same instance, not a copy) */
    public TreeMap<String, String> getParams() {
        return this.params;
    }

    /** @param params the attribute map to store; the reference is kept as-is */
    public void setParams(TreeMap<String, String> params) {
        this.params = params;
    }

    /** @return the text between the opening and closing tag */
    public String getContent() {
        return this.content;
    }

    /** @param content the text between the opening and closing tag */
    public void setContent(String content) {
        this.content = content;
    }
}
(2) String utility class (a helper used by the parser)
/**
 * String helper methods: emptiness checks, repetition, splitting into collections, and light
 * clean-up.
 *
 * <p>Several parsing methods return {@code null} (not an empty collection) for {@code null} or
 * empty input; callers must check for that.
 *
 * @author leng
 */
public class StrUtil {

    /**
     * Tells whether a string is {@code null} or has length zero.
     *
     * @param string the string to test
     * @return {@code true} if {@code string} is {@code null} or {@code ""}
     */
    public static boolean isEmpty(String string) {
        return null == string || string.isEmpty();
    }

    /**
     * Tells whether a string is neither {@code null} nor of length zero.
     *
     * @param string the string to test
     * @return {@code true} if {@code string} has at least one character
     */
    public static boolean isNotEmpty(String string) {
        return !isEmpty(string);
    }

    /**
     * Repeats {@code string} {@code n} times, placing {@code seg} between consecutive copies.
     *
     * @param string the string to repeat; {@code null} is rendered as the text {@code "null"}
     * @param n number of repetitions; zero or a negative value yields {@code ""}
     * @param seg separator placed between copies; {@code null} is treated as {@code ""}
     * @return the joined result, without a trailing separator
     */
    public static String repeatString(String string, int n, String seg) {
        // Guard first: with nothing appended there is no trailing separator to cut off.
        if (n <= 0) {
            return "";
        }
        String separator = (null == seg) ? "" : seg;
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < n; i++) {
            if (i > 0) {
                sb.append(separator);
            }
            sb.append(string);
        }
        return sb.toString();
    }

    /**
     * Repeats the decimal form of {@code num} {@code n} times, separated by {@code seg}.
     *
     * @param num the value to repeat
     * @param n number of repetitions
     * @param seg separator placed between copies; {@code null} is treated as {@code ""}
     * @return the joined result
     */
    public static String repeatString(int num, int n, String seg) {
        return repeatString(String.valueOf(num), n, seg);
    }

    /**
     * Repeats the decimal form of {@code num} {@code n} times, separated by commas.
     *
     * @param num the value to repeat
     * @param n number of repetitions
     * @return the comma-joined result
     */
    public static String repeatString(int num, int n) {
        return repeatString(String.valueOf(num), n, ",");
    }

    /**
     * Joins the non-null elements of an {@code Integer} array with commas.
     *
     * @param intArray the values to join; may be {@code null}
     * @return the comma-joined values, or {@code null} if the array is {@code null} or contains
     *     no non-null element
     */
    public static String IntArrayToString(Integer[] intArray) {
        if (null == intArray) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        for (Integer value : intArray) {
            if (null == value) {
                continue;
            }
            if (sb.length() > 0) {
                sb.append(',');
            }
            sb.append(value);
        }
        return sb.length() > 0 ? sb.toString() : null;
    }

    /**
     * Tells whether {@code str} occurs in {@code array}.
     *
     * @param array the strings to search; may be {@code null} and may contain {@code null}
     *     elements, which never match
     * @param str the string to look for; {@code null} never matches
     * @param ignoreCase whether to compare case-insensitively
     * @return {@code true} if an element equal to {@code str} is found
     */
    public static boolean isExist(String[] array, String str, boolean ignoreCase) {
        if (null == array || null == str) {
            return false;
        }
        for (String candidate : array) {
            if (null == candidate) {
                continue; // a null element used to cause a NullPointerException here
            }
            boolean same = ignoreCase ? candidate.equalsIgnoreCase(str) : candidate.equals(str);
            if (same) {
                return true;
            }
        }
        return false;
    }

    /**
     * Splits {@code string} on {@code splitStr} and collects the pieces that parse as integers.
     * Pieces that are empty or not valid integers are skipped on purpose.
     *
     * @param string the text to split
     * @param splitStr the delimiter, interpreted as a regular expression by {@link String#split}
     * @return the parsed values in encounter order, or {@code null} if {@code string} is
     *     {@code null} or empty
     */
    public static List<Integer> parseList(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        String[] pieces = string.split(splitStr);
        List<Integer> list = new ArrayList<>(pieces.length);
        for (String piece : pieces) {
            if (isEmpty(piece)) {
                continue;
            }
            try {
                list.add(Integer.parseInt(piece));
            } catch (NumberFormatException ignored) {
                // Best-effort parsing: non-numeric pieces are dropped.
            }
        }
        return list;
    }

    /**
     * Splits {@code string} on commas and collects the pieces that parse as integers.
     *
     * @param string the text to split
     * @return the parsed values, or {@code null} if {@code string} is {@code null} or empty
     */
    public static List<Integer> parseList(String string) {
        return parseList(string, ",");
    }

    /**
     * Splits {@code string} on {@code splitStr} and collects the distinct pieces that parse as
     * integers. Pieces that are empty or not valid integers are skipped on purpose.
     *
     * @param string the text to split
     * @param splitStr the delimiter, interpreted as a regular expression by {@link String#split}
     * @return the parsed values, or {@code null} if {@code string} is {@code null} or empty
     */
    public static Set<Integer> parseSet(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        Set<Integer> set = new HashSet<>();
        for (String piece : string.split(splitStr)) {
            if (isEmpty(piece)) {
                continue;
            }
            try {
                set.add(Integer.parseInt(piece));
            } catch (NumberFormatException ignored) {
                // Best-effort parsing: non-numeric pieces are dropped.
            }
        }
        return set;
    }

    /**
     * Splits {@code string} on commas and collects the distinct pieces that parse as integers.
     *
     * @param string the text to split
     * @return the parsed values, or {@code null} if {@code string} is {@code null} or empty
     */
    public static Set<Integer> parseSet(String string) {
        return parseSet(string, ",");
    }

    /**
     * Splits {@code string} on {@code splitStr} and collects the distinct pieces.
     *
     * @param string the text to split
     * @param splitStr the delimiter, interpreted as a regular expression by {@link String#split}
     * @return the distinct pieces, or {@code null} if {@code string} is {@code null} or empty
     */
    public static Set<String> parseSetString(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        return new HashSet<>(Arrays.asList(string.split(splitStr)));
    }

    /**
     * Parses a two-level delimited string into a map. For example {@code "1-2,3-4,5-6"} with
     * {@code seg1=","} and {@code seg2="-"} gives {@code {1=2, 3=4, 5=6}}. An entry without the
     * inner delimiter is stored with a {@code null} value; anything after a second inner
     * delimiter is ignored.
     *
     * @param string the text to parse
     * @param seg1 delimiter between entries (regular expression)
     * @param seg2 delimiter between key and value (regular expression)
     * @return the parsed map, or {@code null} if {@code string} is {@code null} or empty
     */
    public static Map<String, String> stringToMap(String string, String seg1, String seg2) {
        if (isEmpty(string)) {
            return null;
        }
        String[] entries = string.split(seg1);
        Map<String, String> map = new HashMap<>(entries.length);
        for (String entry : entries) {
            if (isEmpty(entry)) {
                continue;
            }
            String[] keyValue = entry.split(seg2);
            if (keyValue.length == 1) {
                map.put(keyValue[0], null);
            } else if (keyValue.length >= 2) {
                map.put(keyValue[0], keyValue[1]);
            }
        }
        return map;
    }

    /**
     * Parses a string such as {@code "1-2,3-4,5-6"} into {@code {1=2, 3=4, 5=6}}.
     *
     * @param string the text to parse
     * @return the parsed map, or {@code null} if {@code string} is {@code null} or empty
     */
    public static Map<String, String> stringToMap(String string) {
        return stringToMap(string, ",", "-");
    }

    /**
     * Removes one leading comma, if present.
     *
     * @param str the text to clean; may be {@code null}
     * @return {@code str} without its first character when it starts with a comma, otherwise
     *     {@code str} unchanged; {@code null} for {@code null} input
     */
    public static String repaceFirstComma(String str) {
        if (null == str) {
            return null;
        }
        return str.startsWith(",") ? str.substring(1) : str;
    }

    /**
     * Converts the non-empty elements of a string array to an {@code int} array.
     *
     * @param array the strings to convert; {@code null} yields an empty result, and {@code null}
     *     or empty elements are skipped
     * @return the parsed values in encounter order
     * @throws NumberFormatException if a non-empty element is not a valid integer
     */
    public static int[] strArrayToIntArray(String[] array) {
        if (null == array) {
            return new int[0];
        }
        // Fill a buffer of the maximum possible size, then trim once at the end.
        int[] buffer = new int[array.length];
        int count = 0;
        for (String s : array) {
            if (isNotEmpty(s)) {
                buffer[count++] = Integer.parseInt(s);
            }
        }
        return count == buffer.length ? buffer : Arrays.copyOf(buffer, count);
    }

    /**
     * Removes the space characters that directly precede or follow every occurrence of
     * {@code specialStr}. For example {@code trim("k  =  v", "=")} gives {@code "k=v"}.
     *
     * @param string the text to clean; may be {@code null}
     * @param specialStr the marker whose surrounding spaces are removed; matched literally
     * @return the cleaned text, or {@code null} for {@code null} input
     */
    public static String trim(String string, String specialStr) {
        if (null == string) {
            return null;
        }
        // String.replace is literal; replaceAll would treat specialStr as a regex.
        String before = " " + specialStr;
        while (string.contains(before)) {
            string = string.replace(before, specialStr);
        }
        String after = specialStr + " ";
        while (string.contains(after)) {
            string = string.replace(after, specialStr);
        }
        return string;
    }

    /**
     * Entry point retained for compatibility; it performs no work.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Intentionally empty.
    }
}
(3) Core parsing logic
/**
 * Regular-expression based helpers for pulling the title, tags and tag attributes out of an HTML
 * string.
 *
 * <p>This is not a full HTML parser. Tag matching is case-insensitive, a paired tag's content must
 * sit on one line (the patterns do not enable DOTALL), and a tag nested inside a tag of the same
 * name is not supported: the match ends at the first closing tag.
 *
 * @author leng
 */
public class WebpageUtil {

    private static final String TITLE_OPEN = "<title>";
    private static final String TITLE_CLOSE = "</title>";

    /**
     * Regex fragment matching zero or more attributes after a tag name, as ONE capturing group.
     * An attribute is a name optionally followed by {@code =} and a double-quoted, single-quoted
     * or unquoted value, so quoted values may contain spaces and {@code >}.
     */
    private static final String ATTRIBUTES =
            "((?:\\s+[^\\s=>/\"']+(?:\\s*=\\s*(?:\"[^\"]*\"|'[^']*'|[^\\s\"'>]+))?)*)";

    /**
     * Matches one {@code name=value} attribute. Group 1 is the name; exactly one of groups 2
     * (double-quoted), 3 (single-quoted) or 4 (unquoted) holds the value.
     */
    private static final Pattern ATTRIBUTE =
            Pattern.compile("([^\\s=>/\"']+)\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))");

    /**
     * Extracts the text between the first {@code <title>} and the next {@code </title>}.
     *
     * @param html the page source; may be {@code null}
     * @return the title text, or {@code null} if {@code html} is {@code null} or does not contain
     *     an opening {@code <title>} followed by a closing {@code </title>}
     */
    public static String getTitleFromHtmlString(String html) {
        if (html == null) {
            return null;
        }
        int openIndex = html.indexOf(TITLE_OPEN);
        if (openIndex == -1) {
            return null;
        }
        int contentStart = openIndex + TITLE_OPEN.length();
        // Search from the end of the opening tag so a stray earlier "</title>" cannot produce an
        // inverted range.
        int closeIndex = html.indexOf(TITLE_CLOSE, contentStart);
        if (closeIndex == -1) {
            return null;
        }
        return html.substring(contentStart, closeIndex);
    }

    /**
     * Finds every occurrence of a tag and returns the raw matched text.
     *
     * <p>If {@code html} contains a closing tag for {@code nodeType}, the tag is treated as paired
     * and each result spans from the opening tag to the first following closing tag. Otherwise it
     * is treated as a single tag and each result is just the tag itself.
     *
     * @param html the page source; {@code null} yields an empty list
     * @param nodeType the tag name, matched literally and case-insensitively; {@code null} yields
     *     an empty list
     * @return the matched fragments in document order
     */
    public static List<String> getNode(String html, String nodeType) {
        List<String> fragments = new ArrayList<>();
        if (html == null || nodeType == null) {
            return fragments;
        }
        Matcher m = tagPattern(nodeType, hasClosingTag(html, nodeType)).matcher(html);
        while (m.find()) {
            fragments.add(m.group());
        }
        return fragments;
    }

    /**
     * Finds every occurrence of a tag and returns it as a structured {@link WebNode}.
     *
     * <p>Paired versus single is decided as in {@link #getNode(String, String)}. Each node carries
     * the requested tag name, the single-node flag, and a map of the attributes written as
     * {@code name=value}; attributes without a value are not recorded. Surrounding quotes are
     * removed from values and quoted values are trimmed. For paired tags the node also carries the
     * text between the opening and closing tag.
     *
     * @param html the page source; {@code null} yields an empty list
     * @param labelType the tag name, matched literally and case-insensitively; {@code null} yields
     *     an empty list
     * @return the parsed nodes in document order
     */
    public static List<WebNode> getNode2(String html, String labelType) {
        List<WebNode> nodes = new ArrayList<>();
        if (html == null || labelType == null) {
            return nodes;
        }
        boolean paired = hasClosingTag(html, labelType);
        Matcher m = tagPattern(labelType, paired).matcher(html);
        while (m.find()) {
            WebNode node = new WebNode();
            node.setLabelType(labelType);
            node.setSingleNode(!paired);
            if (paired) {
                // Group 2 is the lazily matched text between the opening and closing tag.
                node.setContent(m.group(2));
            }
            // Group 1 is the attribute section of the opening tag only.
            node.setParams(parseAttributes(m.group(1)));
            nodes.add(node);
        }
        return nodes;
    }

    /** Tells whether {@code html} contains a closing tag for {@code tag}, ignoring case. */
    private static boolean hasClosingTag(String html, String tag) {
        return Pattern.compile("</" + Pattern.quote(tag) + ">", Pattern.CASE_INSENSITIVE)
                .matcher(html)
                .find();
    }

    /**
     * Builds the pattern for one tag: group 1 is the attribute section and, for paired tags,
     * group 2 is the content up to the first closing tag.
     */
    private static Pattern tagPattern(String tag, boolean paired) {
        String name = Pattern.quote(tag);
        String open = "<" + name + ATTRIBUTES;
        String regex = paired ? open + "\\s*>(.*?)</" + name + ">" : open + "\\s*/?>";
        return Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
    }

    /** Parses the attribute section of an opening tag into a name-sorted map. */
    private static TreeMap<String, String> parseAttributes(String attributeSection) {
        TreeMap<String, String> params = new TreeMap<>();
        Matcher m = ATTRIBUTE.matcher(attributeSection);
        while (m.find()) {
            String value;
            if (m.group(2) != null) {
                value = m.group(2).trim();
            } else if (m.group(3) != null) {
                value = m.group(3).trim();
            } else {
                value = m.group(4);
            }
            params.put(m.group(1), value);
        }
        return params;
    }
}
(4)测试
/**
 * Demo: runs both tag extractors over a small HTML fragment and prints what they find for the
 * {@code a} tag.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    // Sample markup: one div holding a span, five links and two images.
    final String html = "<div class=\"related-tags\">\n" + " <span>Related topics are recommended:</span>\n"
            + " a<a href=\'http://www.csdn.net/tag/标签\' target=\"_blank\">标签</a>\n"
            + " <a href=\"http://www.csdn.net/tag/java\" target=\"_blank\">java</a>\n"
            + " <a href=\"http://www.csdn.net/tag/团购\" target=\"_blank\">团购</a>\n"
            + " <a href=\"http://www.csdn.net/tag/体育\" target=\"_blank\">体育</a>\n"
            + " <a href=\"http://www.csdn.net/tag/搜狐\" icon='a' target=\"_blank\">搜狐</a>\n"
            + "<img src=\"/i/eg_tulip2.jpg\" alt=\"上海鲜花港 - 郁金香\"><img src=\"/i/eg_tulip.jpg\" alt=\"上海鲜花港 - 郁金香\" /> </div>";

    // Raw text of every matched tag.
    List<String> rawTags = WebpageUtil.getNode(html, "a");
    System.out.println("-------aThe original content of the label------");
    for (String rawTag : rawTags) {
        System.out.println(rawTag);
    }

    // The same tags as structured nodes.
    List<WebNode> nodes = WebpageUtil.getNode2(html, "a");
    System.out.println("-------解析后的数据------");
    for (WebNode node : nodes) {
        System.out.println("内容:" + node.getContent());
        System.out.println("标签类型:" + node.getLabelType());
        System.out.println("标签的参数:" + node.getParams());
    }
}
After running it, the output shows that the information of the a tags inside the page's div has been parsed out.
边栏推荐
- 我的Go+语言初体验——祝福留言小系统,让她也可以感受到你的祝福
- Notes on "The Law of Construction"---Chapter 10 Typical Users and Scenarios
- MNIST of Dataset: MNIST (handwritten digital image recognition + ubyte.gz file) data set introduction, download, usage (including data enhancement) detailed guide
- 使用EFR32作为Zigbee/Thread的sniffer的用法
- sqlmap use tutorial Daquan command Daquan (graphics)
- phpoffice edit excel document
- The leap second that may cause the next "Millennium Bug" is boycotted by tech giants
- Thinkphp 5.0.24变量覆盖漏洞导致RCE分析
- The 2nd Shanxi Province Network Security Skills Competition (Enterprise Group) Part of the WP (9)
- PyG搭建R-GCN实现节点分类
猜你喜欢

【Redis高手修炼之路】Jedis——Jedis的基本使用

MySql 怎么查出符合条件的最新的数据行?

How does the Snapdragon 7 series chip perform?Reno8 Pro proves a new generation of God U

What are Redis server startup after the operation?

VUX Datetime 组件compute-days-function动态设置日期列表

DAY17、CSRF 漏洞

How does MySql find out the latest data row that meets the conditions?
![[Linear table] - Detailed explanation of three practice questions of LeetCode](/img/71/91ba0cc16fe062c1ac9e77e1cc8aa2.png)
[Linear table] - Detailed explanation of three practice questions of LeetCode

Shanxi group (enterprises) in the second network security skills competition part problem WP (7)

What is CDH/CDP?
随机推荐
KubeMeet 报名 | 「边缘原生」线上技术沙龙完整议程公布!
Simple experiment with BGP
The 2nd Shanxi Province Network Security Skills Competition (Enterprise Group) Partial WP (10)
C. Qualification Rounds(思维,特情)
MySQL installation error solution
Atomic Guarantees of Redis Distributed Locks
@WebServlet注解(Servlet注解)
共建共享数字世界的根:阿里云打造全面的云原生开源生态
Shanxi group (enterprises) in the second network security skills competition part problem WP (7)
Go 学习笔记(84)— Go 项目目录结构
Unity3D Application模拟进入前后台及暂停
[The Mystery of Cloud Native] Cloud Native Background && Definition && Detailed explanation of related technologies?
DAY17:弱口令的探测与测试
《构建之法》笔记---第十章 典型用户和场景
unity初学5 摄像机跟随,边界控制以及简单的粒子控制(2d)
A must see for software testers!Database knowledge MySQL query statement Daquan
Unity beginner 5 cameras follow, border control and simple particle control (2 d)
我的Go+语言初体验——祝福留言小系统,让她也可以感受到你的祝福
MySQL data query (subtotal and sorting)
KubeMeet Registration | The complete agenda of the "Edge Native" Online Technology Salon has been announced!