old_flsystem/类库/HttpHelper2.1/Helper/HtmlHelper.cs

152 lines
4.1 KiB
C#
Raw Permalink Normal View History

2022-09-20 03:10:29 +00:00
using CsharpHttpHelper.Enum;
using CsharpHttpHelper.Item;
using CsharpHttpHelper.Static;
using System.Collections.Generic;
using System.Text.RegularExpressions;
namespace CsharpHttpHelper.Helper
{
/// <summary>
/// HTML-related helper operations. Copyright http://www.httphelper.com/
/// </summary>
internal class HtmlHelper
{
/// <summary>
/// Extracts the anchor (A) tags found in the supplied HTML.
/// </summary>
/// <param name="html">HTML markup to scan; may be null.</param>
/// <returns>
/// A list of <see cref="AItem"/> built from the matches of <c>RegexString.Alist</c>,
/// or null when <paramref name="html"/> is null or the pattern does not match at all.
/// </returns>
internal static List<AItem> GetAList(string html)
{
    // Regex.Matches rejects null input; report it the same way as "nothing found".
    if (html == null)
    {
        return null;
    }

    // One Matches call replaces the former IsMatch + Matches pair, which scanned the input twice.
    MatchCollection matches = Regex.Matches(html, RegexString.Alist, RegexOptions.IgnoreCase);
    if (matches.Count == 0)
    {
        return null;
    }

    List<AItem> list = new List<AItem>(matches.Count);
    foreach (Match match in matches)
    {
        AItem anchor;
        try
        {
            // Capture group 1 feeds Href, group 2 feeds Text, the whole match feeds Html.
            anchor = new AItem
            {
                Href = match.Groups[1].Value,
                Text = match.Groups[2].Value,
                Html = match.Value,
                Type = AType.Text
            };

            // When the anchor text itself contains image tags, the anchor is flagged
            // as an image link and carries the first image found.
            List<ImgItem> images = GetImgList(anchor.Text);
            if (images != null && images.Count > 0)
            {
                anchor.Type = AType.Img;
                anchor.Img = images[0];
            }
        }
        catch
        {
            // Best effort: an anchor that cannot be built is skipped instead of failing the whole scan.
            continue;
        }

        list.Add(anchor);
    }
    return list;
}
/// <summary>
/// Extracts the image (Img) tags found in the supplied HTML.
/// </summary>
/// <param name="html">HTML markup to scan; may be null.</param>
/// <returns>
/// A list of <see cref="ImgItem"/> built from the matches of <c>RegexString.ImgList</c>,
/// or null when <paramref name="html"/> is null or the pattern does not match at all.
/// </returns>
internal static List<ImgItem> GetImgList(string html)
{
    // Regex.Matches rejects null input; report it the same way as "nothing found".
    if (html == null)
    {
        return null;
    }

    // One Matches call replaces the former IsMatch + Matches pair, which scanned the input twice.
    MatchCollection matches = Regex.Matches(html, RegexString.ImgList, RegexOptions.IgnoreCase);
    if (matches.Count == 0)
    {
        return null;
    }

    List<ImgItem> list = new List<ImgItem>(matches.Count);
    foreach (Match match in matches)
    {
        ImgItem image;
        try
        {
            // Capture group 1 feeds Src, the whole match feeds Html.
            image = new ImgItem
            {
                Src = match.Groups[1].Value,
                Html = match.Value
            };
        }
        catch
        {
            // Best effort: an image that cannot be built is skipped instead of failing the whole scan.
            continue;
        }

        list.Add(image);
    }
    return list;
}
/// <summary>
/// Filters HTML tags out of a string by deleting everything matched by the
/// Nscript, Style, Script and Html patterns of <c>RegexString</c>, in that order.
/// </summary>
/// <param name="html">HTML content to clean.</param>
/// <returns>The text that remains after all four removals.</returns>
internal static string StripHTML(string html)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled;

    // Order matters: each pattern is applied to the output of the previous one.
    string[] removalPatterns =
    {
        RegexString.Nscript,
        RegexString.Style,
        RegexString.Script,
        RegexString.Html
    };

    string remaining = html;
    foreach (string removalPattern in removalPatterns)
    {
        remaining = Regex.Replace(remaining, removalPattern, string.Empty, options);
    }
    return remaining;
}
/// <summary>
/// Filters line-break sequences out of HTML by deleting every match of
/// <c>RegexString.NewLine</c>.
/// </summary>
/// <param name="html">HTML content to process.</param>
/// <returns>The content with all matches replaced by the empty string.</returns>
internal static string ReplaceNewLine(string html)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled;
    string withoutBreaks = Regex.Replace(html, RegexString.NewLine, string.Empty, options);
    return withoutBreaks;
}
/// <summary>
/// Extracts the data located between a start string and an end string in HTML.
/// </summary>
/// <param name="html">Source HTML; may be null.</param>
/// <param name="s">Start string, placed in front of <c>RegexString.AllHtml</c> in the search pattern.</param>
/// <param name="e">End string, placed after <c>RegexString.AllHtml</c> in the search pattern.</param>
/// <returns>
/// The trimmed value of capture group 1 of the first match, or an empty string when
/// <paramref name="html"/> is null or nothing matches.
/// </returns>
internal static string GetBetweenHtml(string html, string s, string e)
{
    // Regex.Match rejects null input; treat it as "nothing found".
    if (html == null)
    {
        return string.Empty;
    }

    // NOTE(review): s and e are spliced into the pattern verbatim, so regex metacharacters in
    // them are interpreted as regex syntax, and a capture group inside s would shift group 1.
    // Left unescaped because existing callers may rely on passing regex fragments - confirm.
    string pattern = $"{s}{RegexString.AllHtml}{e}";

    // A single Match call replaces the former IsMatch + Match pair; Success is the real
    // "found" signal (Match never returns null and Groups always contains group 0).
    Match match = Regex.Match(html, pattern, RegexOptions.IgnoreCase);
    if (!match.Success)
    {
        return string.Empty;
    }

    // Group 1 is presumably the capture defined by RegexString.AllHtml - verify against its definition.
    return match.Groups[1].Value.Trim();
}
/// <summary>
/// Extracts the page title from HTML using <c>RegexString.HtmlTitle</c>.
/// </summary>
/// <param name="html">HTML to search; may be null.</param>
/// <returns>
/// The trimmed value of capture group 1 of the first match, or an empty string when
/// <paramref name="html"/> is null or the pattern does not match.
/// </returns>
internal static string GetHtmlTitle(string html)
{
    // Regex.Match rejects null input; treat it as "no title".
    if (html == null)
    {
        return string.Empty;
    }

    // A single Match call replaces the former IsMatch + Match pair, which scanned the input twice.
    Match titleMatch = Regex.Match(html, RegexString.HtmlTitle);
    if (!titleMatch.Success)
    {
        return string.Empty;
    }

    return titleMatch.Groups[1].Value.Trim();
}
}
}