old_flsystem/类库/HttpHelper2.1/Helper/HtmlHelper.cs

using CsharpHttpHelper.Enum;
using CsharpHttpHelper.Item;
using CsharpHttpHelper.Static;
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace CsharpHttpHelper.Helper
{
	/// <summary>
	/// HTML-related helper operations.  Copyright: http://www.httphelper.com/
	/// </summary>
	internal class HtmlHelper
	{
		/// <summary>
		/// Collects every anchor tag in the given HTML that matches <c>RegexString.Alist</c>.
		/// </summary>
		/// <param name="html">HTML source to analyse. <c>null</c> is treated as "no anchors".</param>
		/// <returns>
		/// A list holding one <see cref="AItem"/> per match, or <c>null</c> when there is
		/// no match at all (callers must null-check the result).
		/// </returns>
		internal static List<AItem> GetAList(string html)
		{
			if (html == null)
			{
				return null;
			}

			// A single scan is enough: an empty collection means "no match".
			MatchCollection matches = Regex.Matches(html, RegexString.Alist, RegexOptions.IgnoreCase);
			if (matches.Count == 0)
			{
				return null;
			}

			List<AItem> list = new List<AItem>(matches.Count);
			foreach (Match item in matches)
			{
				AItem aItem;
				try
				{
					// Capture group 1 is stored as the link target, group 2 as the inner content.
					aItem = new AItem
					{
						Href = item.Groups[1].Value,
						Text = item.Groups[2].Value,
						Html = item.Value,
						Type = AType.Text
					};

					// If the inner content holds at least one image, mark the anchor as an
					// image link and keep the first image found.
					List<ImgItem> imgList = GetImgList(aItem.Text);
					if (imgList != null && imgList.Count > 0)
					{
						aItem.Type = AType.Img;
						aItem.Img = imgList[0];
					}
				}
				catch
				{
					// Deliberate best-effort: an anchor that cannot be converted is skipped
					// so that one bad match does not discard the remaining results.
					continue;
				}
				list.Add(aItem);
			}
			return list;
		}

		/// <summary>
		/// Collects every image tag in the given HTML that matches <c>RegexString.ImgList</c>.
		/// </summary>
		/// <param name="html">HTML source to analyse. <c>null</c> is treated as "no images".</param>
		/// <returns>
		/// A list holding one <see cref="ImgItem"/> per match, or <c>null</c> when there is
		/// no match at all (callers must null-check the result).
		/// </returns>
		internal static List<ImgItem> GetImgList(string html)
		{
			if (html == null)
			{
				return null;
			}

			// A single scan is enough: an empty collection means "no match".
			MatchCollection matches = Regex.Matches(html, RegexString.ImgList, RegexOptions.IgnoreCase);
			if (matches.Count == 0)
			{
				return null;
			}

			List<ImgItem> list = new List<ImgItem>(matches.Count);
			foreach (Match item in matches)
			{
				ImgItem imgItem;
				try
				{
					// Capture group 1 is stored as the image source, the whole match as its markup.
					imgItem = new ImgItem
					{
						Src = item.Groups[1].Value,
						Html = item.Value
					};
				}
				catch
				{
					// Deliberate best-effort: an image that cannot be converted is skipped
					// so that one bad match does not discard the remaining results.
					continue;
				}
				list.Add(imgItem);
			}
			return list;
		}

		/// <summary>
		/// Strips markup from the given HTML by deleting every match of
		/// <c>RegexString.Nscript</c>, <c>RegexString.Style</c>, <c>RegexString.Script</c>
		/// and <c>RegexString.Html</c>, in that order.
		/// </summary>
		/// <param name="html">HTML content to clean.</param>
		/// <returns>The text that remains after all four patterns have been removed.</returns>
		internal static string StripHTML(string html)
		{
			const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled;

			// Applied sequentially; each pass works on the output of the previous one.
			string[] patterns =
			{
				RegexString.Nscript,
				RegexString.Style,
				RegexString.Script,
				RegexString.Html
			};

			string stripped = html;
			foreach (string pattern in patterns)
			{
				stripped = Regex.Replace(stripped, pattern, string.Empty, options);
			}
			return stripped;
		}

		/// <summary>
		/// Deletes every match of <c>RegexString.NewLine</c> (the line-break pattern) from the given HTML.
		/// </summary>
		/// <param name="html">HTML content to process.</param>
		/// <returns>The input with all matches of the line-break pattern removed.</returns>
		internal static string ReplaceNewLine(string html)
		{
			const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled;
			string newLinePattern = RegexString.NewLine;
			string withoutNewLines = Regex.Replace(html, newLinePattern, string.Empty, options);
			return withoutNewLines;
		}

		/// <summary>
		/// Extracts the content found between a start marker and an end marker in the given HTML.
		/// </summary>
		/// <param name="html">Source HTML. <c>null</c> yields an empty string.</param>
		/// <param name="s">
		/// Start marker. It is inserted into the regular expression as-is (not escaped),
		/// so regex metacharacters in it are interpreted as regex syntax.
		/// </param>
		/// <param name="e">End marker; inserted unescaped, same as <paramref name="s"/>.</param>
		/// <returns>
		/// The trimmed value of capture group 1 of the first case-insensitive match,
		/// or <see cref="string.Empty"/> when nothing matches.
		/// </returns>
		internal static string GetBetweenHtml(string html, string s, string e)
		{
			if (html == null)
			{
				return string.Empty;
			}

			// NOTE(review): group 1 is presumably the group declared by RegexString.AllHtml;
			// a capture group inside `s` would shift the numbering - confirm with callers.
			string pattern = $"{s}{RegexString.AllHtml}{e}";

			// One Match call replaces the former IsMatch + Match pair; Success tells whether
			// anything was found (Regex.Match never returns null).
			Match match = Regex.Match(html, pattern, RegexOptions.IgnoreCase);
			return match.Success ? match.Groups[1].Value.Trim() : string.Empty;
		}

		/// <summary>
		/// Extracts the page title from the given HTML using <c>RegexString.HtmlTitle</c>.
		/// </summary>
		/// <param name="html">Source HTML. <c>null</c> yields an empty string.</param>
		/// <returns>
		/// The trimmed value of capture group 1 of the first match,
		/// or <see cref="string.Empty"/> when the pattern does not match.
		/// </returns>
		internal static string GetHtmlTitle(string html)
		{
			if (html == null)
			{
				return string.Empty;
			}

			// No RegexOptions are passed here, so case sensitivity is whatever the pattern itself defines.
			// One Match call replaces the former IsMatch + Match pair.
			Match match = Regex.Match(html, RegexString.HtmlTitle);
			return match.Success ? match.Groups[1].Value.Trim() : string.Empty;
		}
	}
}