152 lines
4.1 KiB
C#
152 lines
4.1 KiB
C#
using CsharpHttpHelper.Enum;
|
||
using CsharpHttpHelper.Item;
|
||
using CsharpHttpHelper.Static;
|
||
using System.Collections.Generic;
|
||
using System.Text.RegularExpressions;
|
||
|
||
namespace CsharpHttpHelper.Helper
|
||
{
|
||
/// <summary>
/// Regex-based helpers for extracting data from, and cleaning up, HTML text.
/// Copyright: http://www.httphelper.com/
/// </summary>
internal class HtmlHelper
{
    /// <summary>
    /// Collects every anchor (A) tag found in the given HTML.
    /// </summary>
    /// <param name="html">The HTML to analyse.</param>
    /// <returns>
    /// A list with one <see cref="AItem"/> per match of <c>RegexString.Alist</c>
    /// (capture group 1 becomes <c>Href</c>, capture group 2 becomes <c>Text</c>),
    /// or <c>null</c> when <paramref name="html"/> is null/empty or nothing matches.
    /// </returns>
    internal static List<AItem> GetAList(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return null;
        }

        // Run the pattern once; the match count tells us whether anything was found.
        MatchCollection matches = Regex.Matches(html, RegexString.Alist, RegexOptions.IgnoreCase);
        if (matches.Count == 0)
        {
            return null;
        }

        List<AItem> anchors = new List<AItem>(matches.Count);
        foreach (Match match in matches)
        {
            try
            {
                AItem anchor = new AItem
                {
                    Href = match.Groups[1].Value,
                    Text = match.Groups[2].Value,
                    Html = match.Value,
                    Type = AType.Text
                };

                // An anchor whose inner text contains an image is reported as an
                // image link, carrying the first image found.
                List<ImgItem> images = GetImgList(anchor.Text);
                if (images != null && images.Count > 0)
                {
                    anchor.Type = AType.Img;
                    anchor.Img = images[0];
                }

                anchors.Add(anchor);
            }
            catch
            {
                // Best effort: an anchor that cannot be converted is skipped so that
                // the remaining anchors are still returned.
            }
        }

        return anchors;
    }

    /// <summary>
    /// Collects every image (IMG) tag found in the given HTML.
    /// </summary>
    /// <param name="html">The HTML to analyse.</param>
    /// <returns>
    /// A list with one <see cref="ImgItem"/> per match of <c>RegexString.ImgList</c>
    /// (capture group 1 becomes <c>Src</c>), or <c>null</c> when
    /// <paramref name="html"/> is null/empty or nothing matches.
    /// </returns>
    internal static List<ImgItem> GetImgList(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return null;
        }

        // Run the pattern once; the match count tells us whether anything was found.
        MatchCollection matches = Regex.Matches(html, RegexString.ImgList, RegexOptions.IgnoreCase);
        if (matches.Count == 0)
        {
            return null;
        }

        List<ImgItem> images = new List<ImgItem>(matches.Count);
        foreach (Match match in matches)
        {
            try
            {
                images.Add(new ImgItem
                {
                    Src = match.Groups[1].Value,
                    Html = match.Value
                });
            }
            catch
            {
                // Best effort: an image that cannot be converted is skipped so that
                // the remaining images are still returned.
            }
        }

        return images;
    }

    /// <summary>
    /// Removes HTML markup from the given text.
    /// </summary>
    /// <param name="html">The HTML content.</param>
    /// <returns>
    /// The text left after removing, in order, everything matched by
    /// <c>RegexString.Nscript</c>, <c>RegexString.Style</c>, <c>RegexString.Script</c>
    /// and <c>RegexString.Html</c>; an empty string when <paramref name="html"/>
    /// is null or empty.
    /// </returns>
    internal static string StripHTML(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }

        const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled;

        // The order is kept as-is: the block-level patterns run before the generic
        // RegexString.Html pattern.
        string[] patterns =
        {
            RegexString.Nscript,
            RegexString.Style,
            RegexString.Script,
            RegexString.Html
        };

        foreach (string pattern in patterns)
        {
            html = Regex.Replace(html, pattern, string.Empty, options);
        }

        return html;
    }

    /// <summary>
    /// Removes all line-break sequences from the given text.
    /// </summary>
    /// <param name="html">The HTML content.</param>
    /// <returns>
    /// The text with every match of <c>RegexString.NewLine</c> removed; an empty
    /// string when <paramref name="html"/> is null or empty.
    /// </returns>
    internal static string ReplaceNewLine(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }

        return Regex.Replace(html, RegexString.NewLine, string.Empty, RegexOptions.IgnoreCase | RegexOptions.Compiled);
    }

    /// <summary>
    /// Extracts the content located between a start marker and an end marker.
    /// </summary>
    /// <param name="html">The source HTML.</param>
    /// <param name="s">The start marker.</param>
    /// <param name="e">The end marker.</param>
    /// <returns>
    /// The trimmed value of capture group 1 of the first match, or an empty string
    /// when <paramref name="html"/> is null/empty or nothing matches.
    /// </returns>
    /// <remarks>
    /// <paramref name="s"/> and <paramref name="e"/> are inserted into the regular
    /// expression verbatim (they are not escaped), so regex metacharacters in them
    /// are interpreted as regex syntax, and a capture group inside
    /// <paramref name="s"/> changes which text ends up in group 1.
    /// </remarks>
    internal static string GetBetweenHtml(string html, string s, string e)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }

        string pattern = $"{s}{RegexString.AllHtml}{e}";

        // Regex.Match never returns null; Success is the reliable "found" indicator.
        Match match = Regex.Match(html, pattern, RegexOptions.IgnoreCase);
        if (!match.Success)
        {
            return string.Empty;
        }

        return match.Groups[1].Value.Trim();
    }

    /// <summary>
    /// Extracts the page title from the given HTML.
    /// </summary>
    /// <param name="html">The HTML content.</param>
    /// <returns>
    /// The trimmed value of capture group 1 of the first match of
    /// <c>RegexString.HtmlTitle</c>, or an empty string when
    /// <paramref name="html"/> is null/empty or nothing matches.
    /// </returns>
    /// <remarks>
    /// Unlike the other helpers in this class, no <c>RegexOptions.IgnoreCase</c> is
    /// passed here, so case sensitivity is decided by the pattern itself.
    /// </remarks>
    internal static string GetHtmlTitle(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }

        Match match = Regex.Match(html, RegexString.HtmlTitle);
        if (!match.Success)
        {
            return string.Empty;
        }

        return match.Groups[1].Value.Trim();
    }
}
|
||
}
|