当前位置:网站首页>网页元素解析a标签
网页元素解析a标签
2022-07-30 04:10:00 【逆水行舟没有退路】
前言
本文介绍网页元素的解析,重点说明如何解析a标签。
网页元素示例(浏览器截取,供参考)
<div class="related-tags">
<span>相关主题推荐:</span>
a<a href='http://www.csdn.net/tag/标签' target="_blank">标签</a>
<a href="http://www.csdn.net/tag/java" target="_blank">java</a>
<a href="http://www.csdn.net/tag/团购" target="_blank">团购</a>
<a href="http://www.csdn.net/tag/体育" target="_blank">体育</a>
<a href="http://www.csdn.net/tag/搜狐" icon='a' target="_blank">搜狐</a>
<img src="/i/eg_tulip2.jpg" alt="上海鲜花港 - 郁金香"><img src="/i/eg_tulip.jpg" alt="上海鲜花港 - 郁金香" /> </div>
如何解析
网页元素分为对称标签和非对称标签。
对称标签如<a>xxx</a>,非对称标签如<img/>
(1)因此在节点类中定义isSingleNode字段,用于标识该标签是否为单节点(非对称)标签
/**
 * A single web page node (HTML element) produced by the parser.
 *
 * @author leng
 */
public class WebNode implements Serializable {

    /** Fixed serialization id so that the serialized form stays stable across recompiles. */
    private static final long serialVersionUID = 1L;

    /** The content between the opening and the closing tag. */
    private String content;

    /** The parameters (attributes) of the tag, keyed by name. */
    private TreeMap<String, String> params = new TreeMap<>();

    /** The tag type. */
    private String labelType;

    /** Whether this is a single-node tag (one without a separate closing tag). */
    private boolean isSingleNode;

    /**
     * @return the content between the opening and closing tag; {@code null} until set
     */
    public String getContent() {
        return content;
    }

    /**
     * @param content the content between the opening and closing tag
     */
    public void setContent(String content) {
        this.content = content;
    }

    /**
     * @return the tag parameters; an empty map on a freshly created node
     */
    public TreeMap<String, String> getParams() {
        return params;
    }

    /**
     * @param params the tag parameters; stored as-is (no copy is made)
     */
    public void setParams(TreeMap<String, String> params) {
        this.params = params;
    }

    /**
     * @return the tag type; {@code null} until set
     */
    public String getLabelType() {
        return labelType;
    }

    /**
     * @param labelType the tag type
     */
    public void setLabelType(String labelType) {
        this.labelType = labelType;
    }

    /**
     * @return {@code true} when this node is a single-node tag
     */
    public boolean isSingleNode() {
        return isSingleNode;
    }

    /**
     * @param isSingleNode {@code true} when this node is a single-node tag
     */
    public void setSingleNode(boolean isSingleNode) {
        this.isSingleNode = isSingleNode;
    }
}
(2)字符串的操作类,辅助作用
/**
 * String helper routines: emptiness checks, repetition, splitting into
 * collections and maps, and small clean-up operations.
 *
 * @author leng
 */
public class StrUtil {

    /**
     * @param string the value to test
     * @return {@code true} when {@code string} is {@code null} or has zero length
     */
    public static boolean isEmpty(String string) {
        return null == string || string.isEmpty();
    }

    /**
     * @param string the value to test
     * @return {@code true} when {@code string} is neither {@code null} nor empty
     */
    public static boolean isNotEmpty(String string) {
        return !isEmpty(string);
    }

    /**
     * Repeats {@code string} {@code n} times, joining the copies with {@code seg}.
     *
     * @param string the text to repeat; {@code null} is appended as the text "null"
     * @param n      number of repetitions; values {@code <= 0} yield an empty string
     * @param seg    separator placed between copies; {@code null} is treated as ""
     * @return the joined text, never {@code null}
     */
    public static String repeatString(String string, int n, String seg) {
        // Guard first: the previous implementation cut the trailing separator off an
        // empty buffer here and failed with StringIndexOutOfBoundsException.
        if (n <= 0) {
            return "";
        }
        String separator = (null == seg) ? "" : seg;
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < n; i++) {
            if (i > 0) {
                sb.append(separator);
            }
            sb.append(string);
        }
        return sb.toString();
    }

    /**
     * Repeats the decimal form of {@code num} {@code n} times, joined with {@code seg}.
     *
     * @param num the number to repeat
     * @param n   number of repetitions
     * @param seg separator placed between copies; {@code null} is treated as ""
     * @return the joined text
     */
    public static String repeatString(int num, int n, String seg) {
        return repeatString(String.valueOf(num), n, seg);
    }

    /**
     * Repeats the decimal form of {@code num} {@code n} times, joined with a comma.
     *
     * @param num the number to repeat
     * @param n   number of repetitions
     * @return the joined text
     */
    public static String repeatString(int num, int n) {
        return repeatString(String.valueOf(num), n, ",");
    }

    /**
     * Joins the non-null elements of an {@code Integer} array with commas.
     *
     * @param intArray the values to join; may be {@code null}
     * @return the comma separated text, or {@code null} when the array is
     *         {@code null} or contains no non-null element
     */
    public static String IntArrayToString(Integer[] intArray) {
        if (null == intArray) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        for (Integer value : intArray) {
            if (null == value) {
                continue;
            }
            if (sb.length() > 0) {
                sb.append(',');
            }
            sb.append(value);
        }
        return sb.length() > 0 ? sb.toString() : null;
    }

    /**
     * Tests whether {@code str} occurs in {@code array}.
     *
     * @param array      the strings to search; {@code null} elements are skipped
     * @param str        the string to look for
     * @param ignoreCase whether the comparison ignores case
     * @return {@code true} when a matching element exists; {@code false} when
     *         either argument is {@code null} or nothing matches
     */
    public static boolean isExist(String[] array, String str, boolean ignoreCase) {
        if (null == array || null == str) {
            return false;
        }
        for (String candidate : array) {
            if (null == candidate) {
                continue; // a null slot can never equal a non-null str
            }
            boolean matches = ignoreCase ? candidate.equalsIgnoreCase(str) : candidate.equals(str);
            if (matches) {
                return true;
            }
        }
        return false;
    }

    /**
     * Splits {@code string} by {@code splitStr} and converts the pieces to a
     * {@code List<Integer>}. Empty pieces and pieces that are not valid integers
     * are skipped.
     *
     * @param string   the text to split
     * @param splitStr the separator; passed to {@link String#split(String)} and
     *                 therefore interpreted as a regular expression
     * @return the parsed numbers in order of appearance, or {@code null} when
     *         {@code string} is {@code null} or empty
     */
    public static List<Integer> parseList(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        String[] pieces = string.split(splitStr);
        List<Integer> list = new ArrayList<>(pieces.length);
        for (String piece : pieces) {
            if (isEmpty(piece)) {
                continue;
            }
            try {
                list.add(Integer.parseInt(piece));
            } catch (NumberFormatException ignored) {
                // best effort: non-numeric pieces are deliberately dropped
            }
        }
        return list;
    }

    /**
     * Splits {@code string} by commas and converts the pieces to a {@code List<Integer>}.
     *
     * @param string the text to split
     * @return see {@link #parseList(String, String)}
     */
    public static List<Integer> parseList(String string) {
        return parseList(string, ",");
    }

    /**
     * Splits {@code string} by {@code splitStr} and converts the pieces to a
     * {@code Set<Integer>}. Empty pieces and pieces that are not valid integers
     * are skipped.
     *
     * @param string   the text to split
     * @param splitStr the separator; passed to {@link String#split(String)} and
     *                 therefore interpreted as a regular expression
     * @return the parsed numbers, or {@code null} when {@code string} is
     *         {@code null} or empty
     */
    public static Set<Integer> parseSet(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        Set<Integer> set = new HashSet<>();
        for (String piece : string.split(splitStr)) {
            if (isEmpty(piece)) {
                continue;
            }
            try {
                set.add(Integer.parseInt(piece));
            } catch (NumberFormatException ignored) {
                // best effort: non-numeric pieces are deliberately dropped
            }
        }
        return set;
    }

    /**
     * Splits {@code string} by commas and converts the pieces to a {@code Set<Integer>}.
     *
     * @param string the text to split
     * @return see {@link #parseSet(String, String)}
     */
    public static Set<Integer> parseSet(String string) {
        return parseSet(string, ",");
    }

    /**
     * Splits {@code string} by {@code splitStr} into a {@code Set<String>}.
     *
     * @param string   the text to split
     * @param splitStr the separator; passed to {@link String#split(String)} and
     *                 therefore interpreted as a regular expression
     * @return the distinct pieces, or {@code null} when {@code string} is
     *         {@code null} or empty
     */
    public static Set<String> parseSetString(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        Set<String> set = new HashSet<>();
        for (String piece : string.split(splitStr)) {
            set.add(piece);
        }
        return set;
    }

    /**
     * Converts a string to a map. Example: string="1-2,3-4,5-6", seg1=",",
     * seg2="-" yields {1=2, 3=4, 5=6}. An entry without a value part is stored
     * with a {@code null} value; parts after the second are ignored.
     *
     * @param string the text to split
     * @param seg1   separator between entries (regular expression, see {@link String#split(String)})
     * @param seg2   separator between key and value (regular expression)
     * @return the resulting map, or {@code null} when {@code string} is
     *         {@code null} or empty
     */
    public static Map<String, String> stringToMap(String string, String seg1, String seg2) {
        if (isEmpty(string)) {
            return null;
        }
        String[] entries = string.split(seg1);
        Map<String, String> map = new HashMap<>(entries.length);
        for (String entry : entries) {
            if (isEmpty(entry)) {
                continue;
            }
            String[] keyValue = entry.split(seg2);
            if (keyValue.length == 1) {
                map.put(keyValue[0], null);
            } else if (keyValue.length >= 2) {
                map.put(keyValue[0], keyValue[1]);
            }
        }
        return map;
    }

    /**
     * Converts a string to a map using "," between entries and "-" between key
     * and value. Example: "1-2,3-4,5-6" yields {1=2, 3=4, 5=6}.
     *
     * @param string the text to split
     * @return see {@link #stringToMap(String, String, String)}
     */
    public static Map<String, String> stringToMap(String string) {
        return stringToMap(string, ",", "-");
    }

    /**
     * Removes a single leading comma. A string that does not start with a comma
     * is returned unchanged.
     *
     * @param str the text to clean; may be {@code null}
     * @return the text without its leading comma, or {@code null} for {@code null}
     */
    public static String repaceFirstComma(String str) {
        if (null == str) {
            return null;
        }
        return str.startsWith(",") ? str.substring(1) : str;
    }

    /**
     * Converts a string array to an {@code int} array, skipping {@code null}
     * and empty elements.
     *
     * @param array the strings to convert; {@code null} yields an empty array
     * @return the parsed values in order of appearance
     * @throws NumberFormatException when a non-empty element is not a valid integer
     */
    public static int[] strArrayToIntArray(String[] array) {
        if (null == array) {
            return new int[0];
        }
        // Fill a buffer of the maximum possible size and shrink once at the end,
        // rather than re-copying the array for every element.
        int[] buffer = new int[array.length];
        int count = 0;
        for (String element : array) {
            if (isNotEmpty(element)) {
                buffer[count++] = Integer.parseInt(element);
            }
        }
        return Arrays.copyOf(buffer, count);
    }

    /**
     * Removes the spaces directly before and after every occurrence of
     * {@code specialStr}.
     *
     * @param string     the text to clean; may be {@code null}
     * @param specialStr the marker whose surrounding spaces are removed; treated
     *                   as literal text
     * @return the cleaned text, or {@code null} when {@code string} is {@code null}
     */
    public static String trim(String string, String specialStr) {
        if (null == string) {
            return null;
        }
        // String.replace is literal. The loop condition is a literal indexOf, so the
        // replacement has to be literal too; replaceAll would read markers such as
        // "." or "+" as regular expressions and corrupt the text.
        String spaceBefore = " " + specialStr;
        while (string.contains(spaceBefore)) {
            string = string.replace(spaceBefore, specialStr);
        }
        String spaceAfter = specialStr + " ";
        while (string.contains(spaceAfter)) {
            string = string.replace(spaceAfter, specialStr);
        }
        return string;
    }

    /**
     * Entry point kept for compatibility; it performs no work.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // intentionally empty: the original only declared an unused local
    }
}
(3)核心的解析
/**
 * Web page helpers: extracts the title and simple, non-nested tags from an
 * HTML string using regular expressions.
 *
 * @author leng
 */
public class WebpageUtil {

    /** Index of the capture group holding the element content in the paired-tag pattern. */
    private static final int CONTENT_GROUP = 4;

    /**
     * Matches one {@code name=value} attribute. The name is group 1; the value
     * is group 2 (double quoted), group 3 (single quoted) or group 4 (unquoted).
     */
    private static final Pattern ATTRIBUTE_PATTERN =
            Pattern.compile("([^\\s\"'=<>/]+)\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))");

    /**
     * Gets the title from a web page.
     *
     * @param html the page source; may be {@code null}
     * @return the text between the first {@code <title>} and the next
     *         {@code </title>}, or {@code null} when either tag is missing
     */
    public static String getTitleFromHtmlString(String html) {
        if (null == html) {
            return null;
        }
        final String openTag = "<title>";
        int startIndex = html.indexOf(openTag);
        if (startIndex == -1) {
            return null;
        }
        int contentStart = startIndex + openTag.length();
        // Search for the closing tag only after the opening tag, so a stray
        // "</title>" earlier in the page cannot produce an invalid range.
        int endIndex = html.indexOf("</title>", contentStart);
        if (endIndex == -1) {
            return null;
        }
        return html.substring(contentStart, endIndex);
    }

    /**
     * Gets the raw text of every node of the given type (nodes nested inside
     * nodes are not supported).
     *
     * @param html     the page source; {@code null} yields an empty list
     * @param nodeType the tag name, e.g. {@code "a"} or {@code "img"}; matched
     *                 case-insensitively
     * @return the matched tags in document order, never {@code null}
     */
    public static List<String> getNode(String html, String nodeType) {
        List<String> list = new ArrayList<>();
        if (null == html || null == nodeType) {
            return list;
        }
        Matcher m = buildTagPattern(nodeType, hasClosingTag(html, nodeType)).matcher(html);
        while (m.find()) {
            list.add(m.group(0).trim());
        }
        return list;
    }

    /**
     * Gets every node of the given type as a parsed {@link WebNode} (nodes
     * nested inside nodes are not supported).
     *
     * @param html      the page source; {@code null} yields an empty list
     * @param labelType the tag name, e.g. {@code "a"} or {@code "img"}; matched
     *                  case-insensitively
     * @return the parsed nodes in document order, never {@code null}
     */
    public static List<WebNode> getNode2(String html, String labelType) {
        List<WebNode> list = new ArrayList<>();
        if (null == html || null == labelType) {
            return list;
        }
        boolean isSingleNode = !hasClosingTag(html, labelType);
        Matcher m = buildTagPattern(labelType, !isSingleNode).matcher(html);
        // Every match starts with "<" + tag name, so the attributes begin right after it.
        int attributesStart = labelType.length() + 1;
        while (m.find()) {
            String link = m.group(0);
            WebNode node = new WebNode();
            node.setLabelType(labelType);
            node.setSingleNode(isSingleNode);

            String openingTag = link;
            if (!isSingleNode) {
                // Use the pattern's content group rather than searching for the last '>',
                // so content that itself contains '>' is not cut short.
                node.setContent(m.group(CONTENT_GROUP));
                openingTag = link.substring(0, m.start(CONTENT_GROUP) - m.start());
            }
            node.setParams(parseAttributes(openingTag.substring(attributesStart)));
            list.add(node);
        }
        return list;
    }

    /** Reports whether {@code html} contains a closing tag for {@code tagName}, ignoring case. */
    private static boolean hasClosingTag(String html, String tagName) {
        return Pattern.compile(Pattern.quote("</" + tagName + ">"), Pattern.CASE_INSENSITIVE)
                .matcher(html)
                .find();
    }

    /** Builds the case-insensitive pattern for a paired ({@code <a>..</a>}) or single ({@code <img>}) tag. */
    private static Pattern buildTagPattern(String tagName, boolean paired) {
        // Quote the tag name so it is always matched literally.
        String tag = Pattern.quote(tagName);
        String regex;
        if (paired) {
            regex = "<" + tag + "(\\s+\\w+\\s*=\\s*(\"|')?(.*?)[\"|'])*>(.*?)</" + tag + ">";
        } else {
            regex = "<" + tag + "(\\s+\\w+\\s*=\\s*(\"|')?(.*?)[\"|'|\\s])*\\s*\\/?>";
        }
        return Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
    }

    /** Parses the {@code name=value} pairs of an opening tag; quoted values may contain spaces. */
    private static TreeMap<String, String> parseAttributes(String attributeText) {
        TreeMap<String, String> params = new TreeMap<>();
        Matcher m = ATTRIBUTE_PATTERN.matcher(attributeText);
        while (m.find()) {
            String value = m.group(2);
            if (null == value) {
                value = m.group(3);
            }
            if (null == value) {
                value = m.group(4);
            }
            params.put(m.group(1), value.trim());
        }
        return params;
    }
}
(4)测试
/**
 * Demo: extracts the {@code a} tags from a sample HTML fragment and prints
 * both the raw tags and their parsed form.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    // Sample markup, one source line per element, joined with line feeds.
    String html = String.join("\n",
            "<div class=\"related-tags\">",
            " <span>相关主题推荐:</span>",
            " a<a href=\'http://www.csdn.net/tag/标签\' target=\"_blank\">标签</a>",
            " <a href=\"http://www.csdn.net/tag/java\" target=\"_blank\">java</a>",
            " <a href=\"http://www.csdn.net/tag/团购\" target=\"_blank\">团购</a>",
            " <a href=\"http://www.csdn.net/tag/体育\" target=\"_blank\">体育</a>",
            " <a href=\"http://www.csdn.net/tag/搜狐\" icon='a' target=\"_blank\">搜狐</a>",
            "<img src=\"/i/eg_tulip2.jpg\" alt=\"上海鲜花港 - 郁金香\"><img src=\"/i/eg_tulip.jpg\" alt=\"上海鲜花港 - 郁金香\" /> </div>");

    // Raw text of every matched tag.
    List<String> rawTags = WebpageUtil.getNode(html, "a");
    System.out.println("-------a标签原内容------");
    for (String rawTag : rawTags) {
        System.out.println(rawTag);
    }

    // The same tags, parsed into content, type and parameters.
    List<WebNode> parsedNodes = WebpageUtil.getNode2(html, "a");
    System.out.println("-------解析后的数据------");
    for (WebNode parsed : parsedNodes) {
        System.out.println("内容:" + parsed.getContent());
        System.out.println("标签类型:" + parsed.getLabelType());
        System.out.println("标签的参数:" + parsed.getParams());
    }
}
执行后输出,可以看到网页div里的a标签信息被解析出来。
边栏推荐
- The difference between forward and redirect
- How to extract year, month and day data in date type in SQL Server
- Roperties类配置文件&DOS查看主机网络情况
- ospf 综合实验(重发布,特殊区域)
- Anti-shake and throttling
- JQ源码分析(环境处理)
- Pytorch框架学习记录4——数据集的使用(torchvision.dataset)
- Mini Program Graduation Works WeChat Second-hand Trading Mini Program Graduation Design Finished Works (5) Task Book
- state space representation
- Pytorch framework learning record 1 - Dataset class code combat
猜你喜欢
Has been empty, a straightforward, continue to copy the top off!
Pytorch framework learning record 5 - the use of DataLoader
Data Lake: Data Integration Tool DataX
Basic introduction to protect the network operations
(6) "Digital Electricity" - Diodes and CMOS Gate Circuits (Introduction)
Mini Program Graduation Works WeChat Second-hand Trading Mini Program Graduation Design Finished Works (5) Task Book
Resampling a uniformly sampled signal
Detailed transport layer
智能答题功能,CRMEB知识付费系统必须有!
[The Mystery of Cloud Native] Cloud Native Background && Definition && Detailed explanation of related technologies?
随机推荐
Mini Program Graduation Works WeChat Points Mall Mini Program Graduation Design Finished Products (3) Background Functions
【驱动】udev设置GPIO加载后所有者、所属组和权限
Pytorch framework learning record 5 - the use of DataLoader
Basic introduction to protect the network operations
Pytorch框架学习记录6——torch.nn.Module和torch.nn.functional.conv2d的使用
The difference between forward and redirect
Roperties类配置文件&DOS查看主机网络情况
防抖与节流
Flutter record learning different animation (2)
Mini Program Graduation Works WeChat Second-hand Trading Mini Program Graduation Design Finished Works (5) Task Book
WEB penetration of information collection
FreeRTOS Personal Notes - Memory Management
Summary of Rpc and gRpc Introduction
spicy(二)unit hooks
Taobao/Tmall get the list of sold product orders API
phpoffice edit excel document
WeChat second-hand transaction small program graduation design finished works (8) graduation design thesis template
Mini Program Graduation Works WeChat Second-hand Trading Mini Program Graduation Design Finished Works (7) Interim Inspection Report
Usage of exists in sql
How to solve the error "no such file or directory" when EasyCVR starts?