当前位置:网站首页>网页元素解析a标签
网页元素解析a标签
2022-07-30 04:10:00 【逆水行舟没有退路】
前言
本文介绍网页元素的解析,重点说明如何解析a标签。
网页元素示例(浏览器截取,供参考)
<div class="related-tags">
<span>相关主题推荐:</span>
a<a href='http://www.csdn.net/tag/标签' target="_blank">标签</a>
<a href="http://www.csdn.net/tag/java" target="_blank">java</a>
<a href="http://www.csdn.net/tag/团购" target="_blank">团购</a>
<a href="http://www.csdn.net/tag/体育" target="_blank">体育</a>
<a href="http://www.csdn.net/tag/搜狐" icon='a' target="_blank">搜狐</a>
<img src="/i/eg_tulip2.jpg" alt="上海鲜花港 - 郁金香"><img src="/i/eg_tulip.jpg" alt="上海鲜花港 - 郁金香" /> </div>
如何解析
网页元素分为对称标签和非对称标签。
对称标签如<a>xxx</a>,非对称标签如<img/>
(1)因此在节点类中定义isSingleNode字段,用于标记该节点是否为单节点(非对称)标签
/**
 * A node (element) extracted from a web page.
 *
 * @author leng
 */
public class WebNode implements Serializable {

    /** Content found in the middle of the node. */
    private String content;

    /** Parameters of the node. */
    private TreeMap<String, String> params = new TreeMap<String, String>();

    /** Tag type of the node. */
    private String labelType;

    /** Whether the node is a single-node tag. */
    private boolean isSingleNode;

    // ---- content ----

    /** @return the content found in the middle of the node */
    public String getContent() {
        return this.content;
    }

    /** @param content the content found in the middle of the node */
    public void setContent(String content) {
        this.content = content;
    }

    // ---- params ----

    /** @return the parameter map held by this node (the live instance, not a copy) */
    public TreeMap<String, String> getParams() {
        return this.params;
    }

    /** @param params the parameter map to store; the reference is kept as-is */
    public void setParams(TreeMap<String, String> params) {
        this.params = params;
    }

    // ---- label type ----

    /** @return the tag type of the node */
    public String getLabelType() {
        return this.labelType;
    }

    /** @param labelType the tag type of the node */
    public void setLabelType(String labelType) {
        this.labelType = labelType;
    }

    // ---- single-node flag ----

    /** @return {@code true} if the node is a single-node tag */
    public boolean isSingleNode() {
        return this.isSingleNode;
    }

    /** @param isSingleNode {@code true} if the node is a single-node tag */
    public void setSingleNode(boolean isSingleNode) {
        this.isSingleNode = isSingleNode;
    }
}
(2)字符串的操作类,辅助作用
/**
 * String helper methods.
 *
 * @author leng
 */
public class StrUtil {

    /**
     * Tells whether a string is {@code null} or has zero length.
     *
     * @param string the string to check
     * @return {@code true} if {@code string} is {@code null} or {@code ""}
     */
    public static boolean isEmpty(String string) {
        return null == string || string.isEmpty();
    }

    /**
     * Tells whether a string is neither {@code null} nor zero-length.
     *
     * @param string the string to check
     * @return {@code true} if {@code string} contains at least one character
     */
    public static boolean isNotEmpty(String string) {
        return !isEmpty(string);
    }

    /**
     * Repeats {@code string} {@code n} times, joining the copies with {@code seg}.
     *
     * @param string the string to repeat
     * @param n      number of repetitions; values {@code <= 0} yield an empty string
     * @param seg    separator placed between copies; {@code null} is treated as {@code ""}
     * @return the joined string, never {@code null}
     */
    public static String repeatString(String string, int n, String seg) {
        if (n <= 0) {
            // Nothing to repeat. Cutting a trailing separator off an empty
            // buffer is what used to throw StringIndexOutOfBoundsException.
            return "";
        }
        String separator = (null == seg) ? "" : seg;
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < n; i++) {
            if (i > 0) {
                sb.append(separator);
            }
            sb.append(string);
        }
        return sb.toString();
    }

    /**
     * Repeats the decimal form of {@code num} {@code n} times, joined with {@code seg}.
     *
     * @param num the number to repeat
     * @param n   number of repetitions
     * @param seg separator placed between copies; {@code null} is treated as {@code ""}
     * @return the joined string
     */
    public static String repeatString(int num, int n, String seg) {
        return repeatString(String.valueOf(num), n, seg);
    }

    /**
     * Repeats the decimal form of {@code num} {@code n} times, joined with a comma.
     *
     * @param num the number to repeat
     * @param n   number of repetitions
     * @return the joined string
     */
    public static String repeatString(int num, int n) {
        return repeatString(String.valueOf(num), n, ",");
    }

    /**
     * Joins the non-null elements of an {@code Integer} array with commas.
     *
     * @param intArray the array to join, may be {@code null}
     * @return the joined string, or {@code null} when the array is {@code null}
     *         or contains no non-null element
     */
    public static String IntArrayToString(Integer[] intArray) {
        if (null == intArray) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        for (Integer value : intArray) {
            if (null != value) {
                sb.append(",").append(value);
            }
        }
        // Every element was written with a leading comma; drop the first one.
        return sb.length() > 0 ? sb.substring(1) : null;
    }

    /**
     * Tells whether {@code str} occurs in {@code array}.
     *
     * @param array      the strings to search; {@code null} elements are skipped
     * @param str        the string to look for
     * @param ignoreCase whether the comparison ignores case
     * @return {@code true} if a matching element exists; {@code false} otherwise,
     *         including when {@code array} or {@code str} is {@code null}
     */
    public static boolean isExist(String[] array, String str, boolean ignoreCase) {
        if (null == array || null == str) {
            return false;
        }
        for (String candidate : array) {
            if (null == candidate) {
                continue; // a null element can never equal a non-null str
            }
            boolean matches = ignoreCase ? candidate.equalsIgnoreCase(str) : candidate.equals(str);
            if (matches) {
                return true;
            }
        }
        return false;
    }

    /**
     * Splits {@code string} by {@code splitStr} and converts the pieces to a
     * {@code List<Integer>}. Empty pieces and pieces that are not valid integers
     * are skipped.
     *
     * @param string   the string to split
     * @param splitStr the delimiter; it is passed to {@link String#split(String)}
     *                 and therefore interpreted as a regular expression
     * @return the parsed numbers in order of appearance, or {@code null} when
     *         {@code string} is empty
     */
    public static List<Integer> parseList(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        String[] pieces = string.split(splitStr);
        List<Integer> list = new ArrayList<>(pieces.length);
        for (String piece : pieces) {
            if (isEmpty(piece)) {
                continue;
            }
            try {
                list.add(Integer.parseInt(piece));
            } catch (NumberFormatException ignored) {
                // Best-effort parsing: non-numeric pieces are deliberately dropped.
            }
        }
        return list;
    }

    /**
     * Splits {@code string} by commas and converts the pieces to a {@code List<Integer>}.
     *
     * @param string the string to split
     * @return see {@link #parseList(String, String)}
     */
    public static List<Integer> parseList(String string) {
        return parseList(string, ",");
    }

    /**
     * Splits {@code string} by {@code splitStr} and converts the pieces to a
     * {@code Set<Integer>}. Empty pieces and pieces that are not valid integers
     * are skipped.
     *
     * @param string   the string to split
     * @param splitStr the delimiter; it is passed to {@link String#split(String)}
     *                 and therefore interpreted as a regular expression
     * @return the parsed numbers, or {@code null} when {@code string} is empty
     */
    public static Set<Integer> parseSet(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        Set<Integer> set = new HashSet<>();
        for (String piece : string.split(splitStr)) {
            if (isEmpty(piece)) {
                continue;
            }
            try {
                set.add(Integer.parseInt(piece));
            } catch (NumberFormatException ignored) {
                // Best-effort parsing: non-numeric pieces are deliberately dropped.
            }
        }
        return set;
    }

    /**
     * Splits {@code string} by commas and converts the pieces to a {@code Set<Integer>}.
     *
     * @param string the string to split
     * @return see {@link #parseSet(String, String)}
     */
    public static Set<Integer> parseSet(String string) {
        return parseSet(string, ",");
    }

    /**
     * Splits {@code string} by {@code splitStr} and collects the pieces in a
     * {@code Set<String>}.
     *
     * @param string   the string to split
     * @param splitStr the delimiter; it is passed to {@link String#split(String)}
     *                 and therefore interpreted as a regular expression
     * @return the distinct pieces, or {@code null} when {@code string} is empty
     */
    public static Set<String> parseSetString(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        Set<String> set = new HashSet<>();
        for (String piece : string.split(splitStr)) {
            set.add(piece);
        }
        return set;
    }

    /**
     * Converts a string to a map. Example: {@code string="1-2,3-4,5-6"},
     * {@code seg1=","}, {@code seg2="-"} gives {@code {1=2, 3=4, 5=6}}.
     * An entry without a value part is stored with a {@code null} value.
     *
     * @param string the string to split
     * @param seg1   delimiter between entries (regular expression)
     * @param seg2   delimiter between key and value (regular expression)
     * @return the resulting map, or {@code null} when {@code string} is empty
     */
    public static Map<String, String> stringToMap(String string, String seg1, String seg2) {
        if (isEmpty(string)) {
            return null;
        }
        String[] entries = string.split(seg1);
        Map<String, String> map = new HashMap<>(entries.length);
        for (String entry : entries) {
            if (isEmpty(entry)) {
                continue;
            }
            String[] keyValue = entry.split(seg2);
            if (keyValue.length == 1) {
                map.put(keyValue[0], null);
            } else if (keyValue.length >= 2) {
                map.put(keyValue[0], keyValue[1]);
            }
        }
        return map;
    }

    /**
     * Converts a string such as {@code "1-2,3-4,5-6"} to the map {@code {1=2, 3=4, 5=6}}.
     *
     * @param string the string to split
     * @return see {@link #stringToMap(String, String, String)}
     */
    public static Map<String, String> stringToMap(String string) {
        return stringToMap(string, ",", "-");
    }

    /**
     * Removes a single leading comma. A string that does not start with a comma
     * is returned unchanged.
     *
     * @param str the string to process, may be {@code null}
     * @return the string without its leading comma, or {@code null} for {@code null} input
     */
    public static String repaceFirstComma(String str) {
        if (null == str) {
            return null;
        }
        return str.startsWith(",") ? str.substring(1) : str;
    }

    /**
     * Converts a string array to an {@code int} array, skipping {@code null}
     * and empty elements.
     *
     * @param array the strings to convert; {@code null} yields an empty array
     * @return the parsed values in order of appearance
     * @throws NumberFormatException if a non-empty element is not a valid integer
     */
    public static int[] strArrayToIntArray(String[] array) {
        if (null == array) {
            return new int[0];
        }
        // Fill a buffer of the maximum possible size and shrink it once at the
        // end instead of re-copying the array for every element.
        int[] buffer = new int[array.length];
        int count = 0;
        for (String s : array) {
            if (isNotEmpty(s)) {
                buffer[count++] = Integer.parseInt(s);
            }
        }
        return Arrays.copyOf(buffer, count);
    }

    /**
     * Removes the spaces directly before and after every occurrence of
     * {@code specialStr}.
     *
     * @param string     the string to process, may be {@code null}
     * @param specialStr the token whose surrounding spaces are removed; it is
     *                   matched literally, not as a regular expression
     * @return the processed string, or {@code null} for {@code null} input
     */
    public static String trim(String string, String specialStr) {
        if (null == string) {
            return null;
        }
        // String.replace is literal, matching the literal indexOf check.
        // replaceAll treated the token as a regex and broke on characters such
        // as "+", "(" or "$". The loops repeat to strip runs of several spaces.
        String spaceBefore = " " + specialStr;
        while (string.indexOf(spaceBefore) > -1) {
            string = string.replace(spaceBefore, specialStr);
        }
        String spaceAfter = specialStr + " ";
        while (string.indexOf(spaceAfter) > -1) {
            string = string.replace(spaceAfter, specialStr);
        }
        return string;
    }

    /**
     * Entry point kept for compatibility; it performs no work.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Intentionally empty.
    }
}
(3)核心的解析
/**
 * Web page helper methods: title lookup and regex-based tag extraction.
 *
 * <p>The tag extraction is a lightweight regular-expression scan, not an HTML
 * parser. It does not support nodes nested inside a node of the same type, and
 * the content of a paired tag must not span several lines.
 *
 * @author leng
 */
public class WebpageUtil {

    private static final String TITLE_OPEN = "<title>";
    private static final String TITLE_CLOSE = "</title>";

    /** Attribute section of a paired tag, e.g. {@code <a href="x">...</a>}. */
    private static final String PAIRED_ATTRIBUTES = "(\\s+\\w+\\s*=\\s*(\"|')?(.*?)[\"|'])*";

    /** Attribute section of a single tag, e.g. {@code <img src="x">}. */
    private static final String SINGLE_ATTRIBUTES = "(\\s+\\w+\\s*=\\s*(\"|')?(.*?)[\"|'|\\s])*";

    /** Index of the capturing group holding the content of a paired tag. */
    private static final int CONTENT_GROUP = 4;

    /**
     * One {@code name=value} attribute. The value is captured in group 2
     * (double-quoted), group 3 (single-quoted) or group 4 (unquoted).
     */
    private static final Pattern ATTRIBUTE = Pattern.compile(
            "([^\\s\"'=<>/]+)\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))");

    /**
     * Extracts the text between the first {@code <title>} and the
     * {@code </title>} that follows it.
     *
     * @param html the page source, may be {@code null}
     * @return the title text, or {@code null} when {@code html} is {@code null}
     *         or no complete title element is found
     */
    public static String getTitleFromHtmlString(String html) {
        if (html == null) {
            return null;
        }
        int openIndex = html.indexOf(TITLE_OPEN);
        if (openIndex == -1) {
            return null;
        }
        int contentStart = openIndex + TITLE_OPEN.length();
        // Search for the closing tag only after the opening tag, so that a stray
        // "</title>" earlier in the page cannot produce an invalid range.
        int closeIndex = html.indexOf(TITLE_CLOSE, contentStart);
        if (closeIndex == -1) {
            return null;
        }
        return html.substring(contentStart, closeIndex);
    }

    /**
     * Returns the raw text of every tag of the given type (nested nodes of the
     * same type are not supported).
     *
     * @param html     the page source
     * @param nodeType the tag name, e.g. {@code "a"} or {@code "img"}; matched
     *                 literally and case-insensitively
     * @return the matched tags in document order; empty when nothing matches or
     *         an argument is {@code null}
     */
    public static List<String> getNode(String html, String nodeType) {
        List<String> list = new ArrayList<String>();
        if (html == null || nodeType == null) {
            return list;
        }
        Matcher m = buildTagPattern(nodeType, isPairedTag(html, nodeType)).matcher(html);
        while (m.find()) {
            list.add(m.group(0).trim());
        }
        return list;
    }

    /**
     * Parses every tag of the given type into a {@link WebNode} holding the tag
     * type, its attributes and, for paired tags, its content.
     *
     * @param html      the page source
     * @param labelType the tag name, e.g. {@code "a"} or {@code "img"}; matched
     *                  literally and case-insensitively
     * @return the parsed nodes in document order; empty when nothing matches or
     *         an argument is {@code null}
     */
    public static List<WebNode> getNode2(String html, String labelType) {
        List<WebNode> list = new ArrayList<>();
        if (html == null || labelType == null) {
            return list;
        }
        boolean isSingleNode = !isPairedTag(html, labelType);
        Matcher m = buildTagPattern(labelType, !isSingleNode).matcher(html);
        while (m.find()) {
            WebNode node = new WebNode();
            node.setLabelType(labelType);
            node.setSingleNode(isSingleNode);

            // The attribute text starts right after "<" + tag name. The offset
            // comes from the match itself, so upper-case tags work as well.
            int attributesStart = m.start() + 1 + labelType.length();
            int attributesEnd;
            if (isSingleNode) {
                attributesEnd = m.end() - 1; // drop the closing ">"
            } else {
                node.setContent(m.group(CONTENT_GROUP));
                attributesEnd = m.start(CONTENT_GROUP) - 1; // ">" of the opening tag
            }
            String attributes = html.substring(attributesStart, attributesEnd);
            if (isSingleNode && attributes.endsWith("/")) {
                // Self-closing form "<img ... />": the slash is not part of an attribute.
                attributes = attributes.substring(0, attributes.length() - 1);
            }
            node.setParams(parseAttributes(attributes));
            list.add(node);
        }
        return list;
    }

    /** Tells whether the page contains a closing tag for the given tag name. */
    private static boolean isPairedTag(String html, String tagName) {
        return html.indexOf("</" + tagName + ">") > -1;
    }

    /** Builds the case-insensitive pattern matching one paired or single tag. */
    private static Pattern buildTagPattern(String tagName, boolean paired) {
        // Quote the name so it is always matched literally.
        String tag = Pattern.quote(tagName);
        String regex;
        if (paired) {
            regex = "<" + tag + PAIRED_ATTRIBUTES + ">(.*?)</" + tag + ">";
        } else {
            regex = "<" + tag + SINGLE_ATTRIBUTES + "\\s*\\/?>";
        }
        return Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
    }

    /**
     * Parses the attribute section of an opening tag into a sorted map. Quoted
     * values may contain spaces; surrounding quotes are removed and the value
     * is trimmed.
     */
    private static TreeMap<String, String> parseAttributes(String attributes) {
        TreeMap<String, String> params = new TreeMap<>();
        Matcher m = ATTRIBUTE.matcher(attributes);
        while (m.find()) {
            String value = m.group(2);
            if (value == null) {
                value = m.group(3);
            }
            if (value == null) {
                value = m.group(4);
            }
            params.put(m.group(1), value.trim());
        }
        return params;
    }
}
(4)测试
/**
 * Demo: extracts the {@code a} tags from a sample HTML fragment, printing first
 * the raw tags and then the parsed content, tag type and parameters of each.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // Sample markup: five links followed by two img tags inside a div.
    String str = "<div class=\"related-tags\">\n" + " <span>相关主题推荐:</span>\n"
            + " a<a href=\'http://www.csdn.net/tag/标签\' target=\"_blank\">标签</a>\n"
            + " <a href=\"http://www.csdn.net/tag/java\" target=\"_blank\">java</a>\n"
            + " <a href=\"http://www.csdn.net/tag/团购\" target=\"_blank\">团购</a>\n"
            + " <a href=\"http://www.csdn.net/tag/体育\" target=\"_blank\">体育</a>\n"
            + " <a href=\"http://www.csdn.net/tag/搜狐\" icon='a' target=\"_blank\">搜狐</a>\n"
            + "<img src=\"/i/eg_tulip2.jpg\" alt=\"上海鲜花港 - 郁金香\"><img src=\"/i/eg_tulip.jpg\" alt=\"上海鲜花港 - 郁金香\" /> </div>";

    // Raw text of each matched tag.
    List<String> rawTags = WebpageUtil.getNode(str, "a");
    System.out.println("-------a标签原内容------");
    for (String rawTag : rawTags) {
        System.out.println(rawTag);
    }

    // Structured view of the same tags.
    List<WebNode> nodes = WebpageUtil.getNode2(str, "a");
    System.out.println("-------解析后的数据------");
    for (WebNode node : nodes) {
        System.out.println("内容:" + node.getContent());
        System.out.println("标签类型:" + node.getLabelType());
        System.out.println("标签的参数:" + node.getParams());
    }
}
执行后输出,可以看到网页div里的a标签信息被解析出来。
边栏推荐
- [The Mystery of Cloud Native] Cloud Native Background && Definition && Detailed explanation of related technologies?
- Shell脚本基本编辑规范及变量
- [Node accesses MongoDB database]
- JQ源码分析(环境处理)
- FreeRTOS Personal Notes - Memory Management
- Mini Program Graduation Works WeChat Points Mall Mini Program Graduation Design Finished Work (5) Task Book
- Why is the Kirin 9000 5G version suddenly back in stock?
- Data Lake: Data Integration Tool DataX
- 【转】Swift 中的面向协议编程:引言
- QT(39)-vs开发qt程序提示无法打开源文件
猜你喜欢

逆向理论知识3【UI修改篇】

弘玑再度入围Gartner 2022 RPA魔力象限并实现位置大幅跃升

High Concurrency Framework Disruptor

函数的底层机制

国内首家沉浸式高逼真元宇宙,希元宇宙正式上线

Based on all volunteers - H and D1 XR806 rare plant monitoring device

day10--install mysql on linux

ospf 综合实验(重发布,特殊区域)

(6) "Digital Electricity" - Diodes and CMOS Gate Circuits (Introduction)

Pytorch framework learning record 5 - the use of DataLoader
随机推荐
FreeRTOS Personal Notes - Memory Management
Mini Program Graduation Works WeChat Second-hand Trading Mini Program Graduation Design Finished Work (2) Mini Program Function
Anti-shake and throttling
SQLSERVER merges subquery data into one field
day10--install mysql on linux
小程序毕设作品之微信积分商城小程序毕业设计成品(4)开题报告
【驱动】udev为USB转4串口的每个串口起别名
小程序毕设作品之微信积分商城小程序毕业设计成品(6)开题答辩PPT
WEB penetration of information collection
Eureka注册中心
Solve the problem of compiling and installing gdb-10.1 unistd.h:663:3: error: #error “Please include config.h first.”
Usage of exists in sql
Let's learn the layout components of flutter together
函数的底层机制
[ 云原生之谜 ] 云原生背景 && 定义 && 相关技术详解?
[Driver] udev sets the owner, group and permissions after GPIO is loaded
Flutter records and learns different animations (1)
How does the AI intelligent security video platform EasyCVR configure the simultaneous transmission of audio and video?
【驱动】udev设置GPIO加载后所有者、所属组和权限
第51篇-知乎请求头参数分析【2022-07-28】