当前位置：网站首页>Web page element parsing a tag

Web page element parsing a tag

2022-07-30 04:33:00 【There is no way out of sailing against the current】

前言

This chapter is about parsing web page elements, focusing on how to parse the <a> tag (A标签).

Examples of web elements（浏览器截取,供参考）

<div class="related-tags">
    <span>Related topics are recommended：</span>
      a<a href='http://www.csdn.net/tag/标签' target="_blank">标签</a>
      <a href="http://www.csdn.net/tag/java" target="_blank">java</a>
      <a href="http://www.csdn.net/tag/团购" target="_blank">团购</a>
      <a href="http://www.csdn.net/tag/体育" target="_blank">体育</a>
      <a href="http://www.csdn.net/tag/搜狐" icon='a' target="_blank">搜狐</a>
<img src="/i/eg_tulip2.jpg" alt="上海鲜花港 - 郁金香"><img src="/i/eg_tulip.jpg" alt="上海鲜花港 - 郁金香" />  </div>

如何解析

Web page elements fall into two kinds: paired tags (an opening and a closing tag) and single tags (no closing tag).

Paired tags, e.g. <a>xxx</a>; single tags, e.g. <img/>.

（1）The node class therefore defines an isSingleNode flag, which records whether the node is a single tag.

/**
 * A node (tag) taken from a web page.
 *
 * @author leng
 */
public class WebNode implements Serializable {

    /** Tag type of the node. */
    private String labelType;

    /** Whether the node is a single node. */
    private boolean isSingleNode;

    /** Content found in the middle of the node. */
    private String content;

    /** Parameters of the node; starts out as an empty map. */
    private TreeMap<String, String> params = new TreeMap<>();

    // ---- tag type ----

    /** @return the tag type, or {@code null} if it was never set */
    public String getLabelType() {
        return this.labelType;
    }

    /** @param labelType the tag type to store */
    public void setLabelType(String labelType) {
        this.labelType = labelType;
    }

    // ---- single-node flag ----

    /** @return {@code true} when the node is flagged as a single node */
    public boolean isSingleNode() {
        return this.isSingleNode;
    }

    /** @param isSingleNode the new value of the single-node flag */
    public void setSingleNode(boolean isSingleNode) {
        this.isSingleNode = isSingleNode;
    }

    // ---- content ----

    /** @return the node content, or {@code null} if it was never set */
    public String getContent() {
        return this.content;
    }

    /** @param content the node content to store */
    public void setContent(String content) {
        this.content = content;
    }

    // ---- parameters ----

    /** @return the parameter map currently held by this node (the same instance, not a copy) */
    public TreeMap<String, String> getParams() {
        return this.params;
    }

    /** @param params the parameter map to store; the reference is kept as-is */
    public void setParams(TreeMap<String, String> params) {
        this.params = params;
    }

}

（2）String utility class (a helper used by the parser)

/**
 * String helper methods: emptiness checks, repetition, splitting into
 * collections/maps and small clean-up operations.
 *
 * @author leng
 */
public class StrUtil {

    /**
     * @param string value to test
     * @return {@code true} if {@code string} is {@code null} or has length 0
     */
    public static boolean isEmpty(String string) {
        return null == string || string.isEmpty();
    }

    /**
     * @param string value to test
     * @return {@code true} if {@code string} is neither {@code null} nor empty
     */
    public static boolean isNotEmpty(String string) {
        return !isEmpty(string);
    }

    /**
     * Repeats {@code string} {@code n} times, joining the copies with {@code seg}.
     *
     * @param string the string to repeat ({@code null} is rendered as {@code "null"})
     * @param n      number of repetitions; values {@code <= 0} give an empty string
     * @param seg    separator placed between copies; {@code null} means {@code ""}
     * @return the joined string, never {@code null}
     */
    public static String repeatString(String string, int n, String seg) {
        String separator = (null == seg) ? "" : seg;
        if (n <= 0) {
            // Nothing to repeat. The previous implementation threw
            // StringIndexOutOfBoundsException here when the separator was non-empty.
            return "";
        }

        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < n; i++) {
            if (i > 0) {
                sb.append(separator);
            }
            sb.append(string);
        }
        return sb.toString();
    }

    /**
     * Repeats the decimal form of {@code num} {@code n} times, joined with {@code seg}.
     *
     * @param num the value to repeat
     * @param n   number of repetitions
     * @param seg separator placed between copies; {@code null} means {@code ""}
     * @return the joined string
     */
    public static String repeatString(int num, int n, String seg) {
        return repeatString(String.valueOf(num), n, seg);
    }

    /**
     * Repeats the decimal form of {@code num} {@code n} times, joined with a comma.
     *
     * @param num the value to repeat
     * @param n   number of repetitions
     * @return the joined string
     */
    public static String repeatString(int num, int n) {
        return repeatString(String.valueOf(num), n, ",");
    }

    /**
     * Joins the non-null elements of an {@code Integer} array with commas.
     *
     * @param intArray array to join; may be {@code null}
     * @return the comma-separated values, or {@code null} when the array is
     *         {@code null} or contains no non-null element
     */
    public static String IntArrayToString(Integer[] intArray) {
        if (null == intArray) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        for (Integer value : intArray) {
            if (null != value) {
                sb.append(",").append(value);
            }
        }
        // Drop the leading comma; an empty builder means there was nothing to join.
        return sb.length() > 0 ? sb.substring(1) : null;
    }

    /**
     * Tests whether {@code str} occurs in {@code array}.
     *
     * @param array      strings to search; may be {@code null} and may contain {@code null} elements
     * @param str        string to look for
     * @param ignoreCase whether the comparison ignores case
     * @return {@code true} if an element equals {@code str}; {@code false} otherwise,
     *         including when {@code array} or {@code str} is {@code null}
     */
    public static boolean isExist(String[] array, String str, boolean ignoreCase) {
        if (null == array || null == str) {
            return false;
        }
        for (String s : array) {
            // Calling equals on str (known non-null) keeps null elements from causing an NPE.
            boolean same = ignoreCase ? str.equalsIgnoreCase(s) : str.equals(s);
            if (same) {
                return true;
            }
        }
        return false;
    }

    /**
     * Splits {@code string} with {@code splitStr} and collects the integer tokens into a list.
     * Empty tokens and tokens that are not valid integers are skipped.
     *
     * @param string   text to split
     * @param splitStr separator; passed to {@link String#split(String)}, so it is a regular expression
     * @return the parsed integers in input order, or {@code null} if {@code string} is null or empty
     */
    public static List<Integer> parseList(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        String[] tokens = string.split(splitStr);
        List<Integer> list = new ArrayList<>(tokens.length);
        for (String token : tokens) {
            Integer num = parseIntOrNull(token);
            if (null != num) {
                list.add(num);
            }
        }
        return list;
    }

    /**
     * Splits {@code string} on commas and collects the integer tokens into a list.
     *
     * @param string text to split
     * @return the parsed integers, or {@code null} if {@code string} is null or empty
     */
    public static List<Integer> parseList(String string) {
        return parseList(string, ",");
    }

    /**
     * Splits {@code string} with {@code splitStr} and collects the integer tokens into a set.
     * Empty tokens and tokens that are not valid integers are skipped.
     *
     * @param string   text to split
     * @param splitStr separator; passed to {@link String#split(String)}, so it is a regular expression
     * @return the parsed integers, or {@code null} if {@code string} is null or empty
     */
    public static Set<Integer> parseSet(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        Set<Integer> set = new HashSet<>();
        for (String token : string.split(splitStr)) {
            Integer num = parseIntOrNull(token);
            if (null != num) {
                set.add(num);
            }
        }
        return set;
    }

    /**
     * Splits {@code string} on commas and collects the integer tokens into a set.
     *
     * @param string text to split
     * @return the parsed integers, or {@code null} if {@code string} is null or empty
     */
    public static Set<Integer> parseSet(String string) {
        return parseSet(string, ",");
    }

    /**
     * Splits {@code string} with {@code splitStr} and collects the tokens into a set.
     *
     * @param string   text to split
     * @param splitStr separator; passed to {@link String#split(String)}, so it is a regular expression
     * @return the tokens (empty tokens included), or {@code null} if {@code string} is null or empty
     */
    public static Set<String> parseSetString(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        Set<String> set = new HashSet<>();
        for (String token : string.split(splitStr)) {
            set.add(token);
        }
        return set;
    }

    /**
     * Converts a string into a map. Example: {@code string="1-2,3-4,5-6"},
     * {@code seg1=","}, {@code seg2="-"} gives {@code {1=2, 3=4, 5=6}}.
     * An entry without a value part is stored with a {@code null} value; parts
     * after the second one are ignored.
     *
     * @param string text to split
     * @param seg1   separator between entries (regular expression, see {@link String#split(String)})
     * @param seg2   separator between key and value (regular expression)
     * @return the resulting map, or {@code null} if {@code string} is null or empty
     */
    public static Map<String, String> stringToMap(String string, String seg1, String seg2) {
        if (isEmpty(string)) {
            return null;
        }
        String[] entries = string.split(seg1);
        Map<String, String> map = new HashMap<>(entries.length);
        for (String entry : entries) {
            if (isEmpty(entry)) {
                continue;
            }
            String[] keyValue = entry.split(seg2);
            if (keyValue.length == 1) {
                map.put(keyValue[0], null);
            } else if (keyValue.length >= 2) {
                map.put(keyValue[0], keyValue[1]);
            }
        }
        return map;
    }

    /**
     * Converts a string into a map using {@code ","} between entries and {@code "-"}
     * between key and value. Example: {@code "1-2,3-4,5-6"} gives {@code {1=2, 3=4, 5=6}}.
     *
     * @param string text to split
     * @return the resulting map, or {@code null} if {@code string} is null or empty
     */
    public static Map<String, String> stringToMap(String string) {
        return stringToMap(string, ",", "-");
    }

    /**
     * Removes one leading comma. A string that does not start with a comma is returned unchanged.
     *
     * @param str input text; may be {@code null}
     * @return {@code str} without its leading comma, or {@code null} for {@code null} input
     */
    public static String repaceFirstComma(String str) {
        if (null == str) {
            return null;
        }
        return str.startsWith(",") ? str.substring(1) : str;
    }

    /**
     * Converts a string array to an {@code int} array, skipping null and empty elements.
     *
     * @param array strings to convert; {@code null} gives an empty result
     * @return the parsed values in input order
     * @throws NumberFormatException if a non-empty element is not a valid integer
     */
    public static int[] strArrayToIntArray(String[] array) {
        if (null == array) {
            return new int[0];
        }
        // Fill a buffer sized for the worst case, then cut it to the used length once,
        // instead of re-copying the whole array for every element.
        int[] buffer = new int[array.length];
        int count = 0;
        for (String s : array) {
            if (isNotEmpty(s)) {
                buffer[count++] = Integer.parseInt(s);
            }
        }
        return Arrays.copyOf(buffer, count);
    }

    /**
     * Removes the space characters directly before and after every occurrence of
     * {@code specialStr}. {@code specialStr} is matched literally, so characters
     * such as {@code (}, {@code *} or {@code $} are safe to use.
     *
     * @param string     text to clean; may be {@code null}
     * @param specialStr the marker whose surrounding spaces are removed
     * @return the cleaned text, or {@code null} for {@code null} input
     */
    public static String trim(String string, String specialStr) {
        if (null == string) {
            return null;
        }

        // Each pass removes one space per occurrence, so loop until runs of spaces are gone.
        String spaceBefore = " " + specialStr;
        while (string.contains(spaceBefore)) {
            string = string.replace(spaceBefore, specialStr);
        }

        String spaceAfter = specialStr + " ";
        while (string.contains(spaceAfter)) {
            string = string.replace(spaceAfter, specialStr);
        }

        return string;
    }

    /**
     * Entry point with no behavior; the previous body only declared an unused local.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
    }

    /** Parses one token as an integer; returns {@code null} for an empty or non-numeric token. */
    private static Integer parseIntOrNull(String token) {
        if (isEmpty(token)) {
            return null;
        }
        try {
            return Integer.valueOf(token);
        } catch (NumberFormatException ignored) {
            // Best-effort parsing: the callers skip tokens that are not integers.
            return null;
        }
    }

}

（3）Core parsing logic

/**
 * Web page utilities: extracts the page title and regex-matched tags from an HTML string.
 *
 * <p>Matching is regex based, not a real HTML parser: a tag must sit on a single
 * line ({@code .} does not match line breaks) and elements of the same type nested
 * inside each other are not supported.
 *
 * @author leng
 */
public class WebpageUtil {

    /**
     * One attribute inside an opening tag: a name, {@code =}, then a double-quoted
     * (group 2), single-quoted (group 3) or unquoted (group 4) value.
     */
    private static final Pattern ATTRIBUTE = Pattern.compile(
            "([^\\s\"'=<>/]+)\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))");

    /**
     * Gets the title from a web page.
     *
     * @param html page source; may be {@code null}
     * @return the text between the first {@code <title>} and the next {@code </title>},
     *         or {@code null} if either is missing
     */
    public static String getTitleFromHtmlString(String html) {
        if (html == null) {
            return null;
        }
        final String openTag = "<title>";
        int startIndex = html.indexOf(openTag);
        if (startIndex == -1) {
            return null;
        }
        int contentStart = startIndex + openTag.length();
        // Search for the closing tag after the opening one, so a stray "</title>"
        // earlier in the document cannot produce an invalid substring range.
        int endIndex = html.indexOf("</title>", contentStart);
        if (endIndex == -1) {
            return null;
        }
        return html.substring(contentStart, endIndex);
    }

    /**
     * Gets the raw text of every tag of the given type (nodes contained within
     * nodes of the same type are not supported).
     *
     * <p>If the document contains a closing tag for {@code nodeType}, whole elements
     * ({@code <a ...>...</a>}) are returned; otherwise only the single tags
     * ({@code <img ...>}) are returned.
     *
     * @param html     page source; {@code null} gives an empty list
     * @param nodeType tag name, e.g. {@code "a"}; it is inserted into the regular
     *                 expression as-is, so it should be a plain tag name
     * @return the matched tag texts in document order, never {@code null}
     */
    public static List<String> getNode(String html, String nodeType) {
        List<String> list = new ArrayList<>();
        if (html == null || nodeType == null) {
            return list;
        }

        Matcher m = buildTagPattern(nodeType, hasClosingTag(html, nodeType)).matcher(html);
        while (m.find()) {
            list.add(m.group(0).trim());
        }
        return list;
    }

    /**
     * Gets every tag of the given type as a parsed {@link WebNode}: tag type,
     * single-node flag, content (paired tags only) and attributes.
     *
     * @param html      page source; {@code null} gives an empty list
     * @param labelType tag name, e.g. {@code "a"}; it is inserted into the regular
     *                  expression as-is, so it should be a plain tag name
     * @return the parsed nodes in document order, never {@code null}
     */
    public static List<WebNode> getNode2(String html, String labelType) {
        List<WebNode> list = new ArrayList<>();
        if (html == null || labelType == null) {
            return list;
        }

        boolean paired = hasClosingTag(html, labelType);
        Matcher m = buildTagPattern(labelType, paired).matcher(html);
        while (m.find()) {
            WebNode node = new WebNode();
            node.setLabelType(labelType);
            node.setSingleNode(!paired);

            String openingTag;
            if (paired) {
                // Group 4 of the paired pattern is the text between the opening and
                // closing tag; everything before it is the opening tag. Using the
                // matcher's own boundaries keeps '>' characters or nested tags in the
                // content from being mistaken for the end of the opening tag.
                node.setContent(m.group(4));
                openingTag = html.substring(m.start(), m.start(4));
            } else {
                openingTag = m.group(0);
            }

            node.setParams(parseAttributes(openingTag));
            list.add(node);
        }

        return list;
    }

    /** Reports whether {@code html} contains a closing tag for {@code labelType}, ignoring case like the tag patterns do. */
    private static boolean hasClosingTag(String html, String labelType) {
        String closingTag = "</" + labelType + ">";
        return Pattern.compile(Pattern.quote(closingTag), Pattern.CASE_INSENSITIVE).matcher(html).find();
    }

    /** Builds the case-insensitive pattern for a paired element or a single tag of the given type. */
    private static Pattern buildTagPattern(String labelType, boolean paired) {
        String regex;
        if (paired) {
            // Elements of paired labels (e.g. <a></a>)
            regex = "<" + labelType + "(\\s+\\w+\\s*=\\s*(\"|\')?(.*?)[\"|\'])*>(.*?)</" + labelType + ">";
        } else {
            // A single-label element (e.g. <img>)
            regex = "<" + labelType + "(\\s+\\w+\\s*=\\s*(\"|\')?(.*?)[\"|\'|\\s])*\\s*\\/?>";
        }
        return Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
    }

    /** Extracts the {@code name=value} attributes of an opening tag; quoted values may contain spaces. */
    private static TreeMap<String, String> parseAttributes(String openingTag) {
        TreeMap<String, String> params = new TreeMap<>();
        Matcher attr = ATTRIBUTE.matcher(openingTag);
        while (attr.find()) {
            String value = attr.group(2);
            if (value == null) {
                value = attr.group(3);
            }
            if (value == null) {
                value = attr.group(4);
            }
            // A later duplicate of an attribute name replaces the earlier one.
            params.put(attr.group(1), value.trim());
        }
        return params;
    }

}

（4）测试

/**
 * Demo: extracts the {@code a} tags from a sample HTML fragment, first as raw
 * text and then as parsed nodes, and prints both results.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // Sample fragment: five <a> tags followed by two <img> tags.
    String html = "<div class=\"related-tags\">\n" + " <span>Related topics are recommended：</span>\n"
            + " a<a href=\'http://www.csdn.net/tag/标签\' target=\"_blank\">标签</a>\n"
            + " <a href=\"http://www.csdn.net/tag/java\" target=\"_blank\">java</a>\n"
            + " <a href=\"http://www.csdn.net/tag/团购\" target=\"_blank\">团购</a>\n"
            + " <a href=\"http://www.csdn.net/tag/体育\" target=\"_blank\">体育</a>\n"
            + " <a href=\"http://www.csdn.net/tag/搜狐\" icon='a' target=\"_blank\">搜狐</a>\n"
            + "<img src=\"/i/eg_tulip2.jpg\" alt=\"上海鲜花港 - 郁金香\"><img src=\"/i/eg_tulip.jpg\" alt=\"上海鲜花港 - 郁金香\" /> </div>";

    // Raw text of each matched tag.
    List<String> rawTags = WebpageUtil.getNode(html, "a");
    System.out.println("-------aThe original content of the label------");
    for (String rawTag : rawTags) {
        System.out.println(rawTag);
    }

    // The same tags, parsed into content / type / attributes.
    List<WebNode> nodes = WebpageUtil.getNode2(html, "a");
    System.out.println("-------解析后的数据------");
    for (WebNode node : nodes) {
        System.out.println("内容:" + node.getContent());
        System.out.println("标签类型:" + node.getLabelType());
        System.out.println("标签的参数:" + node.getParams());
    }
}