using CsharpHttpHelper.Enum;
using CsharpHttpHelper.Item;
using CsharpHttpHelper.Static;
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace CsharpHttpHelper.Helper
{
    /// <summary>
    /// HTML-related helper routines. Copyright: http://www.httphelper.com/
    /// </summary>
    internal class HtmlHelper
    {
        /// <summary>
        /// Collects every anchor (A) link found in the given HTML.
        /// </summary>
        /// <param name="html">The HTML text to analyse.</param>
        /// <returns>
        /// A list with one <see cref="AItem"/> per match of <c>RegexString.Alist</c>,
        /// or <c>null</c> when <paramref name="html"/> is <c>null</c> or nothing matches.
        /// </returns>
        internal static List<AItem> GetAList(string html)
        {
            // Regex APIs throw on null input; report it the same way as "no links found".
            if (html == null)
            {
                return null;
            }

            // A single pass is enough: the match collection tells us whether anything matched.
            MatchCollection matches = Regex.Matches(html, RegexString.Alist, RegexOptions.IgnoreCase);
            if (matches.Count == 0)
            {
                return null;
            }

            List<AItem> list = new List<AItem>(matches.Count);
            foreach (Match match in matches)
            {
                AItem aItem;
                try
                {
                    aItem = new AItem();
                    aItem.Href = match.Groups[1].Value;
                    aItem.Text = match.Groups[2].Value;
                    aItem.Html = match.Value;
                    aItem.Type = AType.Text;

                    // If the link text itself contains an image, mark the link as an
                    // image link and keep the first image found.
                    List<ImgItem> imgList = GetImgList(aItem.Text);
                    if (imgList != null && imgList.Count > 0)
                    {
                        aItem.Type = AType.Img;
                        aItem.Img = imgList[0];
                    }
                }
                catch
                {
                    // Deliberate best effort: a link that cannot be processed is skipped
                    // instead of failing the whole extraction.
                    aItem = null;
                }

                if (aItem != null)
                {
                    list.Add(aItem);
                }
            }

            return list;
        }

        /// <summary>
        /// Collects every Img tag found in the given HTML.
        /// </summary>
        /// <param name="html">The HTML text to analyse.</param>
        /// <returns>
        /// A list with one <see cref="ImgItem"/> per match of <c>RegexString.ImgList</c>,
        /// or <c>null</c> when <paramref name="html"/> is <c>null</c> or nothing matches.
        /// </returns>
        internal static List<ImgItem> GetImgList(string html)
        {
            // Regex APIs throw on null input; report it the same way as "no images found".
            if (html == null)
            {
                return null;
            }

            MatchCollection matches = Regex.Matches(html, RegexString.ImgList, RegexOptions.IgnoreCase);
            if (matches.Count == 0)
            {
                return null;
            }

            List<ImgItem> list = new List<ImgItem>(matches.Count);
            foreach (Match match in matches)
            {
                ImgItem imgItem;
                try
                {
                    imgItem = new ImgItem();
                    imgItem.Src = match.Groups[1].Value;
                    imgItem.Html = match.Value;
                }
                catch
                {
                    // Deliberate best effort: an image that cannot be processed is skipped.
                    imgItem = null;
                }

                if (imgItem != null)
                {
                    list.Add(imgItem);
                }
            }

            return list;
        }

        /// <summary>
        /// Filters HTML markup out of the given text.
        /// </summary>
        /// <param name="html">The HTML content.</param>
        /// <returns>
        /// The text after removing, in this order, every match of <c>RegexString.Nscript</c>,
        /// <c>RegexString.Style</c>, <c>RegexString.Script</c> and <c>RegexString.Html</c>.
        /// A <c>null</c> or empty input is returned unchanged.
        /// </returns>
        internal static string StripHTML(string html)
        {
            // Nothing to strip; also avoids the ArgumentNullException Regex.Replace raises on null.
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled;

            // The order is kept from the original implementation: the block-level patterns
            // run before the final RegexString.Html pass.
            html = Regex.Replace(html, RegexString.Nscript, string.Empty, options);
            html = Regex.Replace(html, RegexString.Style, string.Empty, options);
            html = Regex.Replace(html, RegexString.Script, string.Empty, options);
            html = Regex.Replace(html, RegexString.Html, string.Empty, options);
            return html;
        }

        /// <summary>
        /// Filters all line-break symbols out of the given HTML.
        /// </summary>
        /// <param name="html">The HTML content.</param>
        /// <returns>
        /// The text with every match of <c>RegexString.NewLine</c> removed.
        /// A <c>null</c> or empty input is returned unchanged.
        /// </returns>
        internal static string ReplaceNewLine(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            return Regex.Replace(html, RegexString.NewLine, string.Empty, RegexOptions.IgnoreCase | RegexOptions.Compiled);
        }

        /// <summary>
        /// Extracts the data located between two strings inside an HTML string.
        /// </summary>
        /// <param name="html">The source HTML.</param>
        /// <param name="s">The start string.</param>
        /// <param name="e">The end string.</param>
        /// <returns>
        /// The trimmed value of capture group 1 of the first match, or an empty string
        /// when <paramref name="html"/> is <c>null</c>/empty or nothing matches.
        /// </returns>
        internal static string GetBetweenHtml(string html, string s, string e)
        {
            if (string.IsNullOrEmpty(html))
            {
                return string.Empty;
            }

            // NOTE(review): s and e are placed into the pattern without escaping, so any
            // regex metacharacters they contain are interpreted as pattern syntax. Left
            // unchanged because existing callers may rely on it; callers that want a
            // literal match should pass Regex.Escape(...)-ed values.
            string pattern = $"{s}{RegexString.AllHtml}{e}";

            Match match = Regex.Match(html, pattern, RegexOptions.IgnoreCase);
            if (match.Success)
            {
                return match.Groups[1].Value.Trim();
            }

            return string.Empty;
        }

        /// <summary>
        /// Extracts the title of a web page.
        /// </summary>
        /// <param name="html">The HTML.</param>
        /// <returns>
        /// The trimmed value of capture group 1 of the first match of
        /// <c>RegexString.HtmlTitle</c>, or an empty string when
        /// <paramref name="html"/> is <c>null</c>/empty or nothing matches.
        /// </returns>
        internal static string GetHtmlTitle(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return string.Empty;
            }

            Match match = Regex.Match(html, RegexString.HtmlTitle);
            if (match.Success)
            {
                return match.Groups[1].Value.Trim();
            }

            return string.Empty;
        }
    }
}