old_flsystem/类库/HttpHelper2.1/Helper/HtmlHelper.cs

152 lines
4.1 KiB
C#
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using CsharpHttpHelper.Enum;
using CsharpHttpHelper.Item;
using CsharpHttpHelper.Static;
using System.Collections.Generic;
using System.Text.RegularExpressions;
namespace CsharpHttpHelper.Helper
{
/// <summary>
/// HTML helper routines: extracting anchor and image tags, stripping markup,
/// removing line breaks and pulling out text between two delimiters.
/// Copyright http://www.httphelper.com/
/// </summary>
internal class HtmlHelper
{
    /// <summary>
    /// Collects every anchor (A) tag found in the given HTML.
    /// </summary>
    /// <param name="html">The HTML text to analyse.</param>
    /// <returns>
    /// A list holding one <see cref="AItem"/> per match of <c>RegexString.Alist</c>,
    /// or <c>null</c> when <paramref name="html"/> is <c>null</c> or nothing matches.
    /// </returns>
    internal static List<AItem> GetAList(string html)
    {
        // A null input is treated like "no links found" instead of throwing.
        if (html == null)
        {
            return null;
        }

        // Scan the input once; the match count tells us whether anything was found.
        MatchCollection matches = Regex.Matches(html, RegexString.Alist, RegexOptions.IgnoreCase);
        if (matches.Count == 0)
        {
            // Callers receive null (not an empty list) when there is no match.
            return null;
        }

        List<AItem> links = new List<AItem>(matches.Count);
        foreach (Match match in matches)
        {
            AItem link = null;
            try
            {
                // Group 1 feeds Href, group 2 feeds Text, the whole match feeds Html.
                link = new AItem
                {
                    Href = match.Groups[1].Value,
                    Text = match.Groups[2].Value,
                    Html = match.Value,
                    Type = AType.Text
                };

                // If the link text itself contains image tags, mark the link as an
                // image link and remember the first image.
                List<ImgItem> images = GetImgList(link.Text);
                if (images != null && images.Count > 0)
                {
                    link.Type = AType.Img;
                    link.Img = images[0];
                }
            }
            catch
            {
                // Best effort: a failure while building one item drops only that
                // item, the remaining matches are still processed.
                link = null;
            }

            if (link != null)
            {
                links.Add(link);
            }
        }

        return links;
    }

    /// <summary>
    /// Collects every image (Img) tag found in the given HTML.
    /// </summary>
    /// <param name="html">The HTML text to analyse.</param>
    /// <returns>
    /// A list holding one <see cref="ImgItem"/> per match of <c>RegexString.ImgList</c>,
    /// or <c>null</c> when <paramref name="html"/> is <c>null</c> or nothing matches.
    /// </returns>
    internal static List<ImgItem> GetImgList(string html)
    {
        // A null input is treated like "no images found" instead of throwing.
        if (html == null)
        {
            return null;
        }

        // Scan the input once; the match count tells us whether anything was found.
        MatchCollection matches = Regex.Matches(html, RegexString.ImgList, RegexOptions.IgnoreCase);
        if (matches.Count == 0)
        {
            // Callers receive null (not an empty list) when there is no match.
            return null;
        }

        List<ImgItem> images = new List<ImgItem>(matches.Count);
        foreach (Match match in matches)
        {
            ImgItem image = null;
            try
            {
                // Group 1 feeds Src, the whole match feeds Html.
                image = new ImgItem
                {
                    Src = match.Groups[1].Value,
                    Html = match.Value
                };
            }
            catch
            {
                // Best effort: a failure while building one item drops only that
                // item, the remaining matches are still processed.
                image = null;
            }

            if (image != null)
            {
                images.Add(image);
            }
        }

        return images;
    }

    /// <summary>
    /// Removes HTML markup from the given text.
    /// </summary>
    /// <param name="html">The HTML content.</param>
    /// <returns>
    /// The text after removing, in this order, everything matched by
    /// <c>RegexString.Nscript</c>, <c>RegexString.Style</c>, <c>RegexString.Script</c>
    /// and <c>RegexString.Html</c>; an empty string when <paramref name="html"/>
    /// is <c>null</c> or empty.
    /// </returns>
    internal static string StripHTML(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }

        const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled;

        // The order matters: the generic Html pattern runs last, after the
        // Nscript / Style / Script patterns have been applied.
        // NOTE(review): presumably these strip <noscript>, <style> and <script>
        // blocks before the remaining tags - confirm against RegexString.
        string text = html;
        text = Regex.Replace(text, RegexString.Nscript, string.Empty, options);
        text = Regex.Replace(text, RegexString.Style, string.Empty, options);
        text = Regex.Replace(text, RegexString.Script, string.Empty, options);
        text = Regex.Replace(text, RegexString.Html, string.Empty, options);
        return text;
    }

    /// <summary>
    /// Removes everything matched by <c>RegexString.NewLine</c> from the given text.
    /// </summary>
    /// <param name="html">The HTML content.</param>
    /// <returns>
    /// The text with those matches removed; an empty string when
    /// <paramref name="html"/> is <c>null</c> or empty.
    /// </returns>
    internal static string ReplaceNewLine(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }

        return Regex.Replace(html, RegexString.NewLine, string.Empty, RegexOptions.IgnoreCase | RegexOptions.Compiled);
    }

    /// <summary>
    /// Extracts the data located between two strings inside the given HTML.
    /// </summary>
    /// <param name="html">The source HTML.</param>
    /// <param name="s">The start string.</param>
    /// <param name="e">The end string.</param>
    /// <returns>
    /// The trimmed value of capture group 1 of the first match, or an empty string
    /// when <paramref name="html"/> is <c>null</c> or nothing matches.
    /// </returns>
    internal static string GetBetweenHtml(string html, string s, string e)
    {
        if (html == null)
        {
            return string.Empty;
        }

        // s and e are placed into the pattern verbatim (not escaped), so any regex
        // metacharacters they contain are interpreted as regex syntax.
        string pattern = $"{s}{RegexString.AllHtml}{e}";

        // One scan is enough: Success reports whether the pattern was found.
        Match match = Regex.Match(html, pattern, RegexOptions.IgnoreCase);
        return match.Success ? match.Groups[1].Value.Trim() : string.Empty;
    }

    /// <summary>
    /// Extracts the page title from the given HTML.
    /// </summary>
    /// <param name="html">The HTML text.</param>
    /// <returns>
    /// The trimmed value of capture group 1 of the first match of
    /// <c>RegexString.HtmlTitle</c>, or an empty string when
    /// <paramref name="html"/> is <c>null</c> or nothing matches.
    /// </returns>
    internal static string GetHtmlTitle(string html)
    {
        if (html == null)
        {
            return string.Empty;
        }

        // One scan is enough: Success reports whether the pattern was found.
        Match match = Regex.Match(html, RegexString.HtmlTitle);
        return match.Success ? match.Groups[1].Value.Trim() : string.Empty;
    }
}
}